[llvm] [AMDGPU] Remove the AnnotateKernelFeatures pass (PR #130198)
Jun Wang via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 24 14:40:33 PDT 2025
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/130198
From d10641a3aed125b19a8ae5e66f17961f760e72eb Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Thu, 6 Mar 2025 14:36:00 -0800
Subject: [PATCH 1/6] [AMDGPU] Remove the AnnotateKernelFeatures pass
Previously, the AnnotateKernelFeatures pass inferred two attributes,
amdgpu-calls and amdgpu-stack-objects, which were used to help determine
whether flat scratch init is allowed. PR #118907 created the
amdgpu-no-flat-scratch-init attribute. Continuing that work, this
patch uses that attribute to determine flat scratch init, replacing
amdgpu-calls and amdgpu-stack-objects. This in turn allows the
AnnotateKernelFeatures pass to be removed.
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 3 -
.../AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 9 -
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 -
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 16 +-
.../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 368 ++++-
.../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 390 ++++-
.../AMDGPU/GlobalISel/extractelement.ll | 71 +-
.../AMDGPU/GlobalISel/flat-scratch-init.ll | 4 +-
...licit-kernarg-backend-usage-global-isel.ll | 36 +-
.../GlobalISel/insertelement-stack-lower.ll | 2 +-
.../AMDGPU/GlobalISel/lds-global-value.ll | 5 +-
.../GlobalISel/llvm.amdgcn.if.break.i64.ll | 3 +
.../GlobalISel/llvm.amdgcn.trig.preop.ll | 24 +
.../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 33 +
.../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 33 +
.../abi-attribute-hints-undefined-behavior.ll | 18 +-
.../AMDGPU/addrspacecast-constantexpr.ll | 62 -
llvm/test/CodeGen/AMDGPU/always-uniform.ll | 3 +
...amdgpu-codegenprepare-fold-binop-select.ll | 3 +
.../CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll | 4 +-
.../annotate-kernel-features-hsa-call.ll | 331 ----
.../AMDGPU/annotate-kernel-features-hsa.ll | 165 --
.../AMDGPU/annotate-kernel-features.ll | 103 --
.../attr-amdgpu-flat-work-group-size.ll | 4 +-
.../CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll | 4 +-
.../AMDGPU/attr-amdgpu-waves-per-eu.ll | 4 +-
.../attributor-flatscratchinit-globalisel.ll | 21 +-
llvm/test/CodeGen/AMDGPU/attributor-noopt.ll | 2 +-
.../callee-special-input-sgprs-fixed-abi.ll | 40 +-
llvm/test/CodeGen/AMDGPU/code-object-v3.ll | 12 +-
.../CodeGen/AMDGPU/combine-reg-or-const.ll | 3 +
...dagcomb-extract-vec-elt-different-sizes.ll | 2 +
.../AMDGPU/duplicate-attribute-indirect.ll | 13 -
...cannot-create-empty-or-backward-segment.ll | 2 +-
.../expand-scalar-carry-out-select-user.ll | 3 +
.../CodeGen/AMDGPU/extract_vector_elt-i8.ll | 100 +-
llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 66 +
.../fast-unaligned-load-store.global.ll | 20 +-
llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 236 ++-
.../flat-for-global-subtarget-feature.ll | 7 +-
llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll | 80 +-
.../AMDGPU/fmul-2-combine-multi-use.ll | 48 +
llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 60 +
.../CodeGen/AMDGPU/fneg-modifier-casting.ll | 3 +
llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 62 +-
llvm/test/CodeGen/AMDGPU/half.ll | 231 +++
.../AMDGPU/hsa-metadata-kernel-code-props.ll | 7 +-
llvm/test/CodeGen/AMDGPU/hsa.ll | 4 +-
.../AMDGPU/implicit-kernarg-backend-usage.ll | 36 +-
llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll | 24 +-
.../AMDGPU/insert_vector_elt.v2bf16.ll | 58 +-
.../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 214 ++-
.../CodeGen/AMDGPU/invalid-addrspacecast.ll | 3 +
.../CodeGen/AMDGPU/invalid-cast-load-i1.ll | 2 +
llvm/test/CodeGen/AMDGPU/kernarg-size.ll | 2 +-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 30 +-
.../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll | 12 +
.../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 12 +
.../AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 8 +-
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 70 +-
.../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 114 +-
.../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 126 +-
llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 6 +
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 125 +-
llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 83 +-
llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 18 +
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 164 +-
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 129 +-
llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 105 +-
llvm/test/CodeGen/AMDGPU/load-select-ptr.ll | 3 +-
.../CodeGen/AMDGPU/mad24-get-global-id.ll | 2 +-
.../match-perm-extract-vector-elt-bug.ll | 8 +-
llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 300 ++--
.../AMDGPU/memory-legalizer-flat-agent.ll | 1380 +++++++++++++++++
.../memory-legalizer-flat-nontemporal.ll | 75 +
.../memory-legalizer-flat-singlethread.ll | 1380 +++++++++++++++++
.../AMDGPU/memory-legalizer-flat-system.ll | 1380 +++++++++++++++++
.../AMDGPU/memory-legalizer-flat-volatile.ll | 66 +
.../AMDGPU/memory-legalizer-flat-wavefront.ll | 1365 ++++++++++++++++
.../AMDGPU/memory-legalizer-flat-workgroup.ll | 1320 ++++++++++++++++
.../AMDGPU/memory-legalizer-global-agent.ll | 273 ++++
.../memory-legalizer-global-nontemporal.ll | 15 +
.../memory-legalizer-global-singlethread.ll | 276 ++++
.../AMDGPU/memory-legalizer-global-system.ll | 261 ++++
.../memory-legalizer-global-volatile.ll | 18 +
.../memory-legalizer-global-wavefront.ll | 276 ++++
.../memory-legalizer-global-workgroup.ll | 276 ++++
.../memory-legalizer-local-nontemporal.ll | 9 +
.../AMDGPU/memory-legalizer-local-volatile.ll | 6 +
.../memory-legalizer-private-nontemporal.ll | 59 +-
.../memory-legalizer-private-volatile.ll | 30 +-
llvm/test/CodeGen/AMDGPU/min.ll | 210 +++
llvm/test/CodeGen/AMDGPU/pack.v2f16.ll | 21 +
llvm/test/CodeGen/AMDGPU/pack.v2i16.ll | 18 +
.../AMDGPU/pal-simple-indirect-call.ll | 8 -
...al-regcopy-and-spill-missed-at-regalloc.ll | 24 +-
.../AMDGPU/preload-implicit-kernargs.ll | 178 +--
llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 379 +++--
llvm/test/CodeGen/AMDGPU/sad.ll | 114 +-
.../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll | 16 +
.../scc-clobbered-sgpr-to-vmem-spill.ll | 464 +++---
.../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll | 2 +-
llvm/test/CodeGen/AMDGPU/shift-i128.ll | 24 +-
.../CodeGen/AMDGPU/simple-indirect-call.ll | 15 -
llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 70 +-
.../CodeGen/AMDGPU/spill-vector-superclass.ll | 2 +-
llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 6 +
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll | 2 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll | 2 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll | 2 +-
llvm/test/CodeGen/AMDGPU/trap-abis.ll | 16 +-
llvm/test/CodeGen/AMDGPU/udiv.ll | 42 +
llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 91 +-
.../AMDGPU/vgpr-spill-placement-issue61083.ll | 2 +-
...ine-function-info-long-branch-reg-debug.ll | 7 +-
.../machine-function-info-long-branch-reg.ll | 7 +-
116 files changed, 12738 insertions(+), 1830 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index f5c2b09c84806..43a49f041eb14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -95,11 +95,8 @@ void initializeAMDGPUDAGToDAGISelLegacyPass(PassRegistry &);
void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
-Pass *createAMDGPUAnnotateKernelFeaturesPass();
Pass *createAMDGPUAttributorLegacyPass();
void initializeAMDGPUAttributorLegacyPass(PassRegistry &);
-void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
-extern char &AMDGPUAnnotateKernelFeaturesID;
// DPP/Iterative option enables the atomic optimizer with given strategy
// whereas None disables the atomic optimizer.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index a9bd41382c255..9c9fa5c6e2f0f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -52,11 +52,6 @@ class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
char AMDGPUAnnotateKernelFeatures::ID = 0;
-char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
-
-INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
- "Add AMDGPU function attributes", false, false)
-
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
bool HaveStackObjects = false;
bool Changed = false;
@@ -131,7 +126,3 @@ bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
TM = &TPC->getTM<TargetMachine>();
return false;
}
-
-Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
- return new AMDGPUAnnotateKernelFeatures();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d0454cce15756..f5b033792f5a6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -511,7 +511,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUSwLowerLDSLegacyPass(*PR);
initializeAMDGPUAttributorLegacyPass(*PR);
- initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
initializeAMDGPUAtomicOptimizerPass(*PR);
@@ -1294,12 +1293,6 @@ void AMDGPUPassConfig::addIRPasses() {
}
void AMDGPUPassConfig::addCodeGenPrepare() {
- if (TM->getTargetTriple().isAMDGCN()) {
- // FIXME: This pass adds 2 hacky attributes that can be replaced with an
- // analysis, and should be removed.
- addPass(createAMDGPUAnnotateKernelFeaturesPass());
- }
-
if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 53f5c1efd14eb..a108300e336ce 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -601,12 +601,6 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
const CallingConv::ID CC = F.getCallingConv();
const bool IsKernel =
CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
- // FIXME: Should have analysis or something rather than attribute to detect
- // calls.
- const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
- // FIXME: This attribute is a hack, we just need an analysis on the function
- // to look for allocas.
- const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
KernargSegmentPtr = true;
@@ -629,12 +623,14 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
DispatchID = true;
}
- // TODO: This could be refined a lot. The attribute is a poor way of
- // detecting calls or stack objects that may require it before argument
- // lowering.
+ const bool IsNoFlatScratchInitSet = F.hasFnAttribute("amdgpu-no-flat-scratch-init");
+
if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
(IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
- (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
+ // The line below: If enableFlatScratch() is true, whether
+ // no-flat-scratch-init is set is not important. If enableFlatScratch()
+ // is false, FlatScratchInit cannot be true for graphics CC.
+ (ST.enableFlatScratch() || (!IsNoFlatScratchInitSet && !AMDGPU::isGraphics(CC))) &&
!ST.flatScratchIsArchitected()) {
FlatScratchInit = true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index dce4048a4b87e..ac24f81136fd6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -20,11 +20,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -35,11 +38,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -97,11 +103,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -112,11 +121,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -287,6 +299,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_dec_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -302,6 +317,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_dec_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -359,6 +377,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_dec_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -376,6 +397,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_dec_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -436,6 +460,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
; CI-LABEL: global_atomic_dec_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -453,6 +480,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
; VI-LABEL: global_atomic_dec_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -513,6 +543,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_dec_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -525,6 +558,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_dec_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -575,6 +611,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_dec_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -589,6 +628,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_dec_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -642,6 +684,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
; CI-LABEL: global_atomic_dec_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -656,6 +701,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
; VI-LABEL: global_atomic_dec_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -710,7 +758,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -718,6 +768,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -732,7 +783,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -740,6 +793,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -802,6 +856,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -819,6 +876,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -878,6 +938,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -893,6 +956,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -908,6 +974,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_ret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -922,6 +990,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_ret_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -958,6 +1030,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -975,6 +1050,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -992,6 +1070,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -1006,6 +1086,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1045,6 +1129,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; CI-LABEL: flat_atomic_dec_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -1062,6 +1149,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; VI-LABEL: flat_atomic_dec_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -1079,6 +1169,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -1093,6 +1185,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1132,6 +1228,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1144,6 +1243,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1156,6 +1258,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1167,6 +1271,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1199,6 +1307,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -1213,6 +1324,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -1227,6 +1341,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1238,6 +1354,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1273,6 +1393,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -1287,6 +1410,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -1301,6 +1427,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1312,6 +1440,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1348,7 +1480,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1356,6 +1490,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -1370,7 +1505,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1378,6 +1515,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -1392,6 +1530,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -1410,6 +1550,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 42
@@ -1466,6 +1610,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -1483,6 +1630,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1500,6 +1650,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
@@ -1513,6 +1665,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1559,10 +1715,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1580,10 +1739,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1601,7 +1763,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_ret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -1616,6 +1780,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_ret_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1654,12 +1822,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1677,12 +1848,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1700,7 +1874,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_ret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -1715,6 +1891,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_ret_i64_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1756,10 +1936,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1769,10 +1952,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1782,7 +1968,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -1794,6 +1982,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1828,12 +2020,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1843,12 +2038,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1858,7 +2056,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -1870,6 +2070,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i64_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1907,12 +2111,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1922,12 +2129,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1937,7 +2147,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -1949,6 +2161,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1987,6 +2203,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2013,6 +2232,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2039,12 +2261,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2058,6 +2282,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2116,6 +2344,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -2134,6 +2365,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -2152,12 +2386,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2166,6 +2402,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2219,8 +2459,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_store_dword v[0:1], v3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2237,8 +2480,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2312,7 +2558,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -2328,7 +2577,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -2394,7 +2646,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -2410,7 +2665,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -2594,10 +2852,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_dec_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2610,10 +2871,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_dec_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2671,12 +2935,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_dec_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2689,12 +2956,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_dec_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2753,12 +3023,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
; CI-LABEL: global_atomic_dec_ret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2771,12 +3044,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
; VI-LABEL: global_atomic_dec_ret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2835,10 +3111,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_dec_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2848,10 +3127,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_dec_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2902,12 +3184,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_dec_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2917,12 +3202,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_dec_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2974,12 +3262,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
; CI-LABEL: global_atomic_dec_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2989,12 +3280,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
; VI-LABEL: global_atomic_dec_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -3047,6 +3341,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -3070,6 +3367,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -3144,6 +3444,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -3162,6 +3465,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -3232,7 +3538,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v4, s3
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: flat_store_dword v[3:4], v0
@@ -3251,7 +3560,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dword v[3:4], v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index e2d179a77f76c..23c267e7d184e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -21,11 +21,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -36,11 +39,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -110,11 +116,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -125,11 +134,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -332,6 +344,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_inc_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -347,6 +362,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_inc_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -415,6 +433,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_inc_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -432,6 +453,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_inc_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -503,6 +527,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -520,6 +547,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -592,6 +622,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_inc_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -604,6 +637,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_inc_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -664,6 +700,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_inc_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -678,6 +717,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_inc_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -741,6 +783,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
; CI-LABEL: global_atomic_inc_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -755,6 +800,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
; VI-LABEL: global_atomic_inc_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -820,7 +868,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -828,6 +878,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -842,7 +893,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -850,6 +903,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -925,6 +979,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -942,6 +999,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1019,8 +1079,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_store_dword v[0:1], v3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1037,8 +1100,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1129,7 +1195,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -1145,7 +1214,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1224,7 +1296,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -1240,7 +1315,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1459,10 +1537,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_inc_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1475,10 +1556,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_inc_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1548,12 +1632,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_inc_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1566,12 +1653,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_inc_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1642,12 +1732,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
; CI-LABEL: global_atomic_inc_ret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1660,12 +1753,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
; VI-LABEL: global_atomic_inc_ret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1737,10 +1833,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_inc_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1750,10 +1849,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_inc_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1815,12 +1917,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_inc_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1830,12 +1935,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_inc_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1898,12 +2006,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
; CI-LABEL: global_atomic_inc_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1913,12 +2024,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
; VI-LABEL: global_atomic_inc_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1983,6 +2097,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2006,6 +2123,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2094,6 +2214,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -2112,6 +2235,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -2188,6 +2314,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -2203,6 +2332,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -2218,6 +2350,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_ret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2232,6 +2366,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_ret_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2281,6 +2419,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -2298,6 +2439,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -2315,6 +2459,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_ret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2329,6 +2475,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_ret_i32_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2381,6 +2531,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; CI-LABEL: flat_atomic_inc_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -2398,6 +2551,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; VI-LABEL: flat_atomic_inc_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -2415,6 +2571,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2429,6 +2587,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2482,6 +2644,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2494,6 +2659,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2506,6 +2674,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -2517,6 +2687,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2560,6 +2734,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -2574,6 +2751,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -2588,6 +2768,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -2599,6 +2781,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2645,6 +2831,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -2659,6 +2848,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -2673,6 +2865,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -2684,6 +2878,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2732,7 +2930,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2740,6 +2940,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -2754,7 +2955,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2762,6 +2965,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -2776,6 +2980,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2794,6 +3000,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 42
@@ -2871,6 +3081,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -2888,6 +3101,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -2905,6 +3121,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
@@ -2918,6 +3136,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2988,7 +3210,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v4, s3
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: flat_store_dword v[3:4], v0
@@ -3007,7 +3232,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dword v[3:4], v0
@@ -3097,10 +3325,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3118,10 +3349,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3139,7 +3373,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_ret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -3154,6 +3390,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_ret_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3206,12 +3446,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3229,12 +3472,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3252,7 +3498,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_ret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -3267,6 +3515,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_ret_i64_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3322,12 +3574,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; CI-LABEL: flat_atomic_inc_ret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3345,12 +3600,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; VI-LABEL: flat_atomic_inc_ret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3368,7 +3626,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -3383,6 +3643,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3439,10 +3703,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3452,10 +3719,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3465,7 +3735,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -3477,6 +3749,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3523,12 +3799,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3538,12 +3817,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3553,7 +3835,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -3565,6 +3849,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3614,12 +3902,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3629,12 +3920,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3644,7 +3938,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -3656,6 +3952,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3707,6 +4007,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -3733,6 +4036,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -3759,12 +4065,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3778,6 +4086,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -3858,6 +4170,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -3876,6 +4191,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -3894,12 +4212,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3908,6 +4228,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -3975,6 +4299,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s4
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0
@@ -3982,6 +4307,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -3995,6 +4322,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0
@@ -4002,6 +4330,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 31a229a908142..9ef16aef0dd16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3016,7 +3016,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -3027,7 +3027,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT: user_sgpr_count = 12
+; GPRIDX-NEXT: user_sgpr_count = 14
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -3042,7 +3042,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1
; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1
; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 15
+; GPRIDX-NEXT: wavefront_sgpr_count = 17
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -3107,7 +3107,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -3118,7 +3118,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT: user_sgpr_count = 12
+; MOVREL-NEXT: user_sgpr_count = 14
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -3133,7 +3133,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_sgpr_queue_ptr = 1
; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; MOVREL-NEXT: enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1
; MOVREL-NEXT: enable_sgpr_private_segment_size = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -3150,7 +3150,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 10
+; MOVREL-NEXT: wavefront_sgpr_count = 24
; MOVREL-NEXT: workitem_vgpr_count = 4
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
@@ -3168,21 +3168,24 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; MOVREL-NEXT: s_load_dword s8, s[8:9], 0x8
+; MOVREL-NEXT: s_add_i32 s12, s12, s17
+; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; MOVREL-NEXT: s_mov_b32 s4, 0
; MOVREL-NEXT: s_mov_b32 s5, 0x40080000
-; MOVREL-NEXT: s_mov_b32 s2, 0
-; MOVREL-NEXT: s_mov_b32 s3, 0x40140000
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s8, 1
; MOVREL-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
; MOVREL-NEXT: s_cmp_eq_u32 s8, 2
+; MOVREL-NEXT: s_mov_b32 s2, 0
; MOVREL-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; MOVREL-NEXT: s_cmp_eq_u32 s8, 3
+; MOVREL-NEXT: s_mov_b32 s3, 0x40140000
; MOVREL-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5]
; MOVREL-NEXT: s_cmp_eq_u32 s8, 4
; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; MOVREL-NEXT: v_mov_b32_e32 v0, s2
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
+; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13
; MOVREL-NEXT: v_mov_b32_e32 v1, s3
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -3210,7 +3213,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT: user_sgpr_count = 12
+; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -3225,7 +3228,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_sgpr_queue_ptr = 1
; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GFX10-NEXT: enable_sgpr_dispatch_id = 1
-; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1
; GFX10-NEXT: enable_sgpr_private_segment_size = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4042,7 +4045,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT: user_sgpr_count = 12
+; GPRIDX-NEXT: user_sgpr_count = 14
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4057,7 +4060,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1
; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1
; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4074,7 +4077,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 14
+; GPRIDX-NEXT: wavefront_sgpr_count = 16
; GPRIDX-NEXT: workitem_vgpr_count = 2
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -4115,7 +4118,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4126,7 +4129,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT: user_sgpr_count = 12
+; MOVREL-NEXT: user_sgpr_count = 14
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4141,7 +4144,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_sgpr_queue_ptr = 1
; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; MOVREL-NEXT: enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1
; MOVREL-NEXT: enable_sgpr_private_segment_size = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4158,7 +4161,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 10
+; MOVREL-NEXT: wavefront_sgpr_count = 24
; MOVREL-NEXT: workitem_vgpr_count = 3
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
@@ -4176,6 +4179,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dword s2, s[8:9], 0x8
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; MOVREL-NEXT: s_add_i32 s12, s12, s17
+; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s2, 1
; MOVREL-NEXT: s_cselect_b32 s3, 2.0, 1.0
@@ -4211,7 +4217,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT: user_sgpr_count = 12
+; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4226,7 +4232,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_sgpr_queue_ptr = 1
; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GFX10-NEXT: enable_sgpr_dispatch_id = 1
-; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1
; GFX10-NEXT: enable_sgpr_private_segment_size = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4387,7 +4393,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT: user_sgpr_count = 12
+; GPRIDX-NEXT: user_sgpr_count = 14
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4402,7 +4408,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1
; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1
; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4419,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 14
+; GPRIDX-NEXT: wavefront_sgpr_count = 16
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -4463,7 +4469,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4474,7 +4480,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT: user_sgpr_count = 12
+; MOVREL-NEXT: user_sgpr_count = 14
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4489,7 +4495,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_sgpr_queue_ptr = 1
; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; MOVREL-NEXT: enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1
; MOVREL-NEXT: enable_sgpr_private_segment_size = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4506,7 +4512,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 10
+; MOVREL-NEXT: wavefront_sgpr_count = 24
; MOVREL-NEXT: workitem_vgpr_count = 4
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
@@ -4524,10 +4530,12 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dword s6, s[8:9], 0x8
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; MOVREL-NEXT: s_add_i32 s12, s12, s17
+; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; MOVREL-NEXT: s_mov_b32 s2, 0
-; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s6, 1
+; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
; MOVREL-NEXT: s_cmp_eq_u32 s6, 2
; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
@@ -4535,6 +4543,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
; MOVREL-NEXT: v_mov_b32_e32 v0, s2
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
+; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13
; MOVREL-NEXT: v_mov_b32_e32 v1, s3
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -4562,7 +4571,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT: user_sgpr_count = 12
+; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4577,7 +4586,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_sgpr_queue_ptr = 1
; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GFX10-NEXT: enable_sgpr_dispatch_id = 1
-; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1
; GFX10-NEXT: enable_sgpr_private_segment_size = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
index 00c44c27257bb..e207d95287783 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
@@ -35,7 +35,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
; RO-FLAT: scratch_store_dword
; RW-FLAT: .amdhsa_user_sgpr_private_segment_buffer 1
; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
-; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1
+; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 0
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; RW-FLAT-NOT: .amdhsa_enable_private_segment
@@ -43,7 +43,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
; RO-FLAT: .amdhsa_enable_private_segment 1
; RW-FLAT: .amdhsa_reserve_flat_scratch 0
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
-; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 4
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
%alloca = alloca i32, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 676035735d0af..64cdf577a3db9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -12,7 +12,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40
-; GFX8V4-NEXT: v_mov_b32_e32 v2, 1
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_mov_b32 s4, s0
; GFX8V4-NEXT: s_mov_b32 s5, s3
@@ -23,6 +25,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1
; GFX8V4-NEXT: v_mov_b32_e32 v0, s4
; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[6:7], 0
+; GFX8V4-NEXT: v_mov_b32_e32 v2, 1
; GFX8V4-NEXT: v_mov_b32_e32 v1, s5
; GFX8V4-NEXT: flat_store_dword v[0:1], v2
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
@@ -37,7 +40,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0xc8
-; GFX8V5-NEXT: v_mov_b32_e32 v2, 1
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_mov_b32 s4, s0
; GFX8V5-NEXT: s_mov_b32 s5, s2
@@ -47,6 +52,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[2:3], 0
+; GFX8V5-NEXT: v_mov_b32_e32 v2, 1
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
; GFX8V5-NEXT: flat_store_dword v[0:1], v2
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
@@ -60,9 +66,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V4-LABEL: addrspacecast:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX9V4-NEXT: v_mov_b32_e32 v2, 1
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V4-NEXT: s_mov_b32 s2, s0
; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1
@@ -71,6 +78,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1
; GFX9V4-NEXT: v_mov_b32_e32 v0, s2
; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V4-NEXT: v_mov_b32_e32 v2, 1
; GFX9V4-NEXT: v_mov_b32_e32 v1, s3
; GFX9V4-NEXT: flat_store_dword v[0:1], v2
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
@@ -84,9 +92,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V5-LABEL: addrspacecast:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX9V5-NEXT: v_mov_b32_e32 v2, 1
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: s_mov_b32 s2, s0
; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1
@@ -95,6 +104,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1
; GFX9V5-NEXT: v_mov_b32_e32 v0, s2
; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V5-NEXT: v_mov_b32_e32 v2, 1
; GFX9V5-NEXT: v_mov_b32_e32 v1, s3
; GFX9V5-NEXT: flat_store_dword v[0:1], v2
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
@@ -117,6 +127,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0
@@ -130,6 +143,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0
@@ -173,6 +189,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0
@@ -186,6 +205,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0
@@ -269,7 +291,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V4: ; %bb.0:
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
; GFX8V4-NEXT: v_mov_b32_e32 v0, s6
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8V4-NEXT: v_mov_b32_e32 v1, s7
; GFX8V4-NEXT: s_add_u32 s0, s8, 8
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -295,7 +320,10 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V5: ; %bb.0:
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
; GFX8V5-NEXT: v_mov_b32_e32 v0, s6
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8V5-NEXT: v_mov_b32_e32 v1, s7
; GFX8V5-NEXT: s_add_u32 s0, s8, 8
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index 378c6312c52be..94853767ccfac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[20:23], s[8:9], 0x0
; GCN-NEXT: s_load_dwordx2 s[24:25], s[8:9], 0x10
-; GCN-NEXT: s_add_u32 s0, s0, s15
+; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: v_mov_b32_e32 v64, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
index a6a7f35a774db..859f7ef16e395 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -11,13 +11,16 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace(
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 4
; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_add_i32 s12, s12, s17
; CHECK-NEXT: ds_read_b32 v2, v0
-; CHECK-NEXT: v_mov_b32_e32 v3, 9
+; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_add_u32 s0, s0, 4
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_mov_b32_e32 v3, 9
; CHECK-NEXT: flat_store_dword v[0:1], v2
; CHECK-NEXT: v_mov_b32_e32 v0, 0x200
; CHECK-NEXT: ds_write_b32 v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
index dcc2c23cae046..a5a75f74833f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
@@ -6,6 +6,9 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) {
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s2, s[8:9], 0x0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0xa
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
index ad588ebee2f9e..1deee215e522b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
@@ -42,6 +42,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -59,6 +62,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -76,6 +82,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -85,6 +93,10 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
;
; GFX10-LABEL: s_trig_preop_f64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
@@ -113,6 +125,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
; CI-LABEL: s_trig_preop_f64_imm:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; CI-NEXT: s_add_u32 s0, s0, 4
@@ -128,6 +143,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
; VI-LABEL: s_trig_preop_f64_imm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; VI-NEXT: s_add_u32 s0, s0, 4
@@ -143,6 +161,8 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
; GFX9-LABEL: s_trig_preop_f64_imm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
@@ -151,6 +171,10 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
;
; GFX10-LABEL: s_trig_preop_f64_imm:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 40f29c56c8f12..b59f85b2dfa38 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -7,6 +7,9 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: sdivrem_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s6, s5, 31
; GFX8-NEXT: s_add_i32 s0, s5, s6
@@ -146,6 +149,9 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: sdivrem_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s2, s9, 31
; GFX8-NEXT: s_ashr_i32 s12, s11, 31
@@ -617,6 +623,9 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-LABEL: sdivrem_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s2, s10, 31
; GFX8-NEXT: s_add_i32 s0, s10, s2
@@ -845,6 +854,9 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
; GFX8-LABEL: sdivrem_v4i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1271,6 +1283,9 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
; GFX8-LABEL: sdivrem_v2i64:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2187,6 +2202,9 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
; GFX8-LABEL: sdiv_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -2332,6 +2350,9 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: sdivrem_v2i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010
; GFX8-NEXT: s_ashr_i32 s3, s0, 31
@@ -2596,6 +2617,9 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
; GFX8-LABEL: sdiv_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -2741,6 +2765,9 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-LABEL: sdivrem_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sext_i32_i16 s0, s3
; GFX8-NEXT: s_ashr_i32 s10, s0, 31
@@ -3002,6 +3029,9 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
; GFX8-LABEL: sdivrem_i3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -3153,6 +3183,9 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: sdivrem_i27:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index e3c1a52696b47..ff0114cfc3ddb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -7,6 +7,9 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: udivrem_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
; GFX8-NEXT: s_sub_i32 s0, 0, s5
@@ -113,6 +116,9 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: udivrem_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10
@@ -523,6 +529,9 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-LABEL: udivrem_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11
@@ -685,6 +694,9 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
; GFX8-LABEL: udivrem_v4i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -980,6 +992,9 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
; GFX8-LABEL: udivrem_v2i64:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x20
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1772,6 +1787,9 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
; GFX8-LABEL: udiv_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
@@ -1885,6 +1903,9 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s0, s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
@@ -2081,6 +2102,9 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
; GFX8-LABEL: udiv_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s5, s4, 16
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
@@ -2194,6 +2218,9 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s2, s1, 0xffff
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
@@ -2387,6 +2414,9 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
; GFX8-LABEL: udivrem_i3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
@@ -2505,6 +2535,9 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: udivrem_i27:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index b3b4959678855..7a7863462357b 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -135,6 +135,9 @@ define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr)
; FIXEDABI-LABEL: marked_kernel_use_workitem_id:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; FIXEDABI-NEXT: s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0)
; FIXEDABI-NEXT: v_mov_b32_e32 v4, s1
; FIXEDABI-NEXT: v_mov_b32_e32 v3, s0
@@ -181,16 +184,19 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr)
; FIXEDABI-LABEL: marked_kernel_use_workgroup_id:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s6
+; FIXEDABI-NEXT: s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
+; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8
; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0)
; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0
; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1
; FIXEDABI-NEXT: flat_store_dword v[0:1], v2
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s7
+; FIXEDABI-NEXT: v_mov_b32_e32 v2, s9
; FIXEDABI-NEXT: flat_store_dword v[0:1], v2
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8
+; FIXEDABI-NEXT: v_mov_b32_e32 v2, s10
; FIXEDABI-NEXT: flat_store_dword v[0:1], v2
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
; FIXEDABI-NEXT: s_endpgm
@@ -238,6 +244,9 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
; FIXEDABI: ; %bb.0:
+; FIXEDABI-NEXT: s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; FIXEDABI-NEXT: s_add_u32 s0, s4, 8
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0
@@ -261,7 +270,10 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 {
; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr:
; FIXEDABI: ; %bb.0:
+; FIXEDABI-NEXT: s_add_i32 s4, s4, s9
; FIXEDABI-NEXT: v_mov_b32_e32 v0, 0
+; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-NEXT: v_mov_b32_e32 v1, 0
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
; FIXEDABI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index 59bd4e9ac8ce6..3eba47d7d7852 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s
declare void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) nocapture, ptr addrspace(4) nocapture, i32, i1) #0
@@ -27,11 +26,6 @@ define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 {
}
define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)), align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)), align 4
@@ -42,11 +36,6 @@ define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 {
}
define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.i32 to ptr addrspace(4)), align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.i32 to ptr addrspace(4)), align 4
@@ -57,11 +46,6 @@ define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 {
}
define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4
@@ -92,12 +76,6 @@ define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 {
}
define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@load_constant_cast_group_gv_gep_to_flat
-; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@load_constant_cast_group_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4
@@ -110,12 +88,6 @@ define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(ptr addrspace
}
define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat
-; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 1 seq_cst, align 4
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 1 seq_cst, align 4
@@ -128,13 +100,6 @@ define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(ptr addr
}
define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat
-; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = cmpxchg ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4
-; AKF_HSA-NEXT: [[VAL0:%.*]] = extractvalue { i32, i1 } [[VAL]], 0
-; AKF_HSA-NEXT: store i32 [[VAL0]], ptr addrspace(1) [[OUT]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = cmpxchg ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4
@@ -149,11 +114,6 @@ define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(ptr addrsp
}
define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat
-; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) align 4 [[OUT]], ptr addrspace(4) align 4 getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 32, i1 false)
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: call void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) align 4 [[OUT]], ptr addrspace(4) align 4 getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 32, i1 false)
@@ -165,11 +125,6 @@ define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(ptr addrspa
; Can't just search the pointer value
define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(ptr addrspace(1) %out) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat
-; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: store ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), ptr addrspace(1) [[OUT]], align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: store ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), ptr addrspace(1) [[OUT]], align 8
@@ -181,11 +136,6 @@ define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(ptr addr
; Can't just search pointer types
define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(ptr addrspace(1) %out) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat
-; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8
@@ -197,11 +147,6 @@ define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat
; Cast group to flat, do GEP, cast back to group
define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: store i32 7, ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)), align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)), align 4
@@ -212,10 +157,6 @@ define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #
}
define ptr addrspace(3) @ret_constant_cast_group_gv_gep_to_flat_to_group() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3))
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3))
@@ -229,14 +170,11 @@ attributes #1 = { nounwind }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
-; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; AKF_HSA: attributes #[[ATTR1]] = { nounwind }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
-; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
; ATTRIBUTOR_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
index b6c0271e5f56f..4e7022710c671 100644
--- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
@@ -8,8 +8,10 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
; GCN-LABEL: readfirstlane_uniform:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s0, s0, s4
@@ -18,6 +20,7 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
; GCN-NEXT: s_add_u32 s0, s2, 40
; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 7fdc012d4f1b5..e71bf15384727 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -393,6 +393,9 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
; GCN-LABEL: select_add_lhs_const_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s0, 0
; GCN-NEXT: s_movk_i32 s0, 0x80
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
index 3e19ee5567929..a4fe7121e347d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
@@ -2,8 +2,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE
; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
-; TRAP-HANDLER-ENABLE: NumSgprs: 61
-; TRAP-HANDLER-DISABLE: NumSgprs: 77
+; TRAP-HANDLER-ENABLE: NumSgprs: 67
+; TRAP-HANDLER-DISABLE: NumSgprs: 83
define amdgpu_kernel void @amdhsa_trap_num_sgprs(
ptr addrspace(1) %out0, i32 %in0,
ptr addrspace(1) %out1, i32 %in1,
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index 6d205921923d3..8389a8e86cb44 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=AKF_HSA %s
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=ATTRIBUTOR_HSA %s
; TODO: The test contains UB which is refined by the Attributor and should be removed.
@@ -19,12 +18,6 @@ declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #0
declare i64 @llvm.amdgcn.dispatch.id() #0
define void @use_workitem_id_x() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_x
-; AKF_HSA-SAME: () #[[ATTR1:[0-9]+]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_x
; ATTRIBUTOR_HSA-SAME: () #[[ATTR1:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
@@ -37,12 +30,6 @@ define void @use_workitem_id_x() #1 {
}
define void @use_workitem_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -55,12 +42,6 @@ define void @use_workitem_id_y() #1 {
}
define void @use_workitem_id_z() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_z
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_z
; ATTRIBUTOR_HSA-SAME: () #[[ATTR3:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
@@ -73,12 +54,6 @@ define void @use_workitem_id_z() #1 {
}
define void @use_workgroup_id_x() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_x
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_x
; ATTRIBUTOR_HSA-SAME: () #[[ATTR4:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -91,12 +66,6 @@ define void @use_workgroup_id_x() #1 {
}
define void @use_workgroup_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR5:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -109,12 +78,6 @@ define void @use_workgroup_id_y() #1 {
}
define void @use_workgroup_id_z() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_z
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_z
; ATTRIBUTOR_HSA-SAME: () #[[ATTR6:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
@@ -127,12 +90,6 @@ define void @use_workgroup_id_z() #1 {
}
define void @use_dispatch_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) poison, align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR7:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -145,12 +102,6 @@ define void @use_dispatch_ptr() #1 {
}
define void @use_queue_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_queue_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[QUEUE_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
-; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[QUEUE_PTR]], ptr addrspace(1) poison, align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_queue_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR8:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[QUEUE_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
@@ -163,12 +114,6 @@ define void @use_queue_ptr() #1 {
}
define void @use_dispatch_id() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_id
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id()
-; AKF_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) poison, align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_id
; ATTRIBUTOR_HSA-SAME: () #[[ATTR9:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id()
@@ -181,14 +126,6 @@ define void @use_dispatch_id() #1 {
}
define void @use_workgroup_id_y_workgroup_id_z() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y_workgroup_id_z
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) poison, align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) poison, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y_workgroup_id_z
; ATTRIBUTOR_HSA-SAME: () #[[ATTR10:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -205,11 +142,6 @@ define void @use_workgroup_id_y_workgroup_id_z() #1 {
}
define void @func_indirect_use_workitem_id_x() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_x
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workitem_id_x()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_x
; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_x()
@@ -220,11 +152,6 @@ define void @func_indirect_use_workitem_id_x() #1 {
}
define void @kernel_indirect_use_workitem_id_x() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workitem_id_x
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workitem_id_x()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workitem_id_x
; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_x()
@@ -235,11 +162,6 @@ define void @kernel_indirect_use_workitem_id_x() #1 {
}
define void @func_indirect_use_workitem_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workitem_id_y()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_y()
@@ -250,11 +172,6 @@ define void @func_indirect_use_workitem_id_y() #1 {
}
define void @func_indirect_use_workitem_id_z() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_z
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workitem_id_z()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_z
; ATTRIBUTOR_HSA-SAME: () #[[ATTR3]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_z()
@@ -265,11 +182,6 @@ define void @func_indirect_use_workitem_id_z() #1 {
}
define void @func_indirect_use_workgroup_id_x() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_x
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workgroup_id_x()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_x
; ATTRIBUTOR_HSA-SAME: () #[[ATTR4]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_x()
@@ -280,11 +192,6 @@ define void @func_indirect_use_workgroup_id_x() #1 {
}
define void @kernel_indirect_use_workgroup_id_x() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workgroup_id_x
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workgroup_id_x()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workgroup_id_x
; ATTRIBUTOR_HSA-SAME: () #[[ATTR4]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_x()
@@ -295,11 +202,6 @@ define void @kernel_indirect_use_workgroup_id_x() #1 {
}
define void @func_indirect_use_workgroup_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workgroup_id_y()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_y()
@@ -310,11 +212,6 @@ define void @func_indirect_use_workgroup_id_y() #1 {
}
define void @func_indirect_use_workgroup_id_z() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_z
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workgroup_id_z()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_z
; ATTRIBUTOR_HSA-SAME: () #[[ATTR6]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_z()
@@ -325,11 +222,6 @@ define void @func_indirect_use_workgroup_id_z() #1 {
}
define void @func_indirect_indirect_use_workgroup_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_indirect_use_workgroup_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @func_indirect_use_workgroup_id_y()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_indirect_use_workgroup_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] {
; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_use_workgroup_id_y()
@@ -340,11 +232,6 @@ define void @func_indirect_indirect_use_workgroup_id_y() #1 {
}
define void @indirect_x2_use_workgroup_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@indirect_x2_use_workgroup_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @func_indirect_indirect_use_workgroup_id_y()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_x2_use_workgroup_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] {
; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_indirect_use_workgroup_id_y()
@@ -355,11 +242,6 @@ define void @indirect_x2_use_workgroup_id_y() #1 {
}
define void @func_indirect_use_dispatch_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_dispatch_ptr()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_dispatch_ptr()
@@ -370,11 +252,6 @@ define void @func_indirect_use_dispatch_ptr() #1 {
}
define void @func_indirect_use_queue_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_queue_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_queue_ptr()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_queue_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_queue_ptr()
@@ -385,11 +262,6 @@ define void @func_indirect_use_queue_ptr() #1 {
}
define void @func_indirect_use_dispatch_id() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_id
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_dispatch_id()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_id
; ATTRIBUTOR_HSA-SAME: () #[[ATTR9]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_dispatch_id()
@@ -400,11 +272,6 @@ define void @func_indirect_use_dispatch_id() #1 {
}
define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y_workgroup_id_z
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @func_indirect_use_workgroup_id_y_workgroup_id_z()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y_workgroup_id_z
; ATTRIBUTOR_HSA-SAME: () #[[ATTR11:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_use_workgroup_id_y_workgroup_id_z()
@@ -415,13 +282,6 @@ define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 {
}
define void @recursive_use_workitem_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@recursive_use_workitem_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4
-; AKF_HSA-NEXT: call void @recursive_use_workitem_id_y()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@recursive_use_workitem_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -436,11 +296,6 @@ define void @recursive_use_workitem_id_y() #1 {
}
define void @call_recursive_use_workitem_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@call_recursive_use_workitem_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @recursive_use_workitem_id_y()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@call_recursive_use_workitem_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: call void @recursive_use_workitem_id_y()
@@ -451,12 +306,6 @@ define void @call_recursive_use_workitem_id_y() #1 {
}
define void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
-; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
-; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
@@ -470,12 +319,6 @@ define void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 {
define void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) %ptr) #2 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_gfx9
-; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
-; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
-; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_gfx9
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
@@ -488,13 +331,6 @@ define void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) %ptr) #2 {
}
define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) %ptr) #2 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_queue_ptr_gfx9
-; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR2]] {
-; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
-; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4
-; AKF_HSA-NEXT: call void @func_indirect_use_queue_ptr()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_queue_ptr_gfx9
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR14:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
@@ -509,11 +345,6 @@ define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) %pt
}
define void @indirect_use_group_to_flat_addrspacecast() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast(ptr addrspace(3) null)
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast(ptr addrspace(3) null)
@@ -524,11 +355,6 @@ define void @indirect_use_group_to_flat_addrspacecast() #1 {
}
define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null)
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9
; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null)
@@ -539,11 +365,6 @@ define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 {
}
define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null)
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9
; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null)
@@ -554,12 +375,6 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 {
}
define void @use_kernarg_segment_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_kernarg_segment_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[KERNARG_SEGMENT_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[KERNARG_SEGMENT_PTR]], ptr addrspace(1) poison, align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_kernarg_segment_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] {
; ATTRIBUTOR_HSA-NEXT: [[KERNARG_SEGMENT_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
@@ -571,11 +386,6 @@ define void @use_kernarg_segment_ptr() #1 {
ret void
}
define void @func_indirect_use_kernarg_segment_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_kernarg_segment_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_kernarg_segment_ptr()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_kernarg_segment_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_kernarg_segment_ptr()
@@ -586,12 +396,6 @@ define void @func_indirect_use_kernarg_segment_ptr() #1 {
}
define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) poison, align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@@ -604,12 +408,6 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
}
define void @use_implicitarg_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) poison, align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@@ -622,11 +420,6 @@ define void @use_implicitarg_ptr() #1 {
}
define void @func_indirect_use_implicitarg_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_implicitarg_ptr()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_implicitarg_ptr()
@@ -640,10 +433,6 @@ declare void @external.func() #3
; This function gets deleted.
define internal void @defined.func() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@defined.func
-; AKF_HSA-SAME: () #[[ATTR3:[0-9]+]] {
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func
; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: ret void
@@ -652,11 +441,6 @@ define internal void @defined.func() #3 {
}
define void @func_call_external() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_call_external
-; AKF_HSA-SAME: () #[[ATTR3]] {
-; AKF_HSA-NEXT: call void @external.func()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external
; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @external.func()
@@ -667,11 +451,6 @@ define void @func_call_external() #3 {
}
define void @func_call_defined() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_call_defined
-; AKF_HSA-SAME: () #[[ATTR3]] {
-; AKF_HSA-NEXT: call void @defined.func()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_defined
; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] {
; ATTRIBUTOR_HSA-NEXT: call void @defined.func()
@@ -681,11 +460,6 @@ define void @func_call_defined() #3 {
ret void
}
define void @func_call_asm() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_call_asm
-; AKF_HSA-SAME: () #[[ATTR3]] {
-; AKF_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR3]]
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm
; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] {
; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR26:[0-9]+]]
@@ -696,11 +470,6 @@ define void @func_call_asm() #3 {
}
define amdgpu_kernel void @kern_call_external() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_external
-; AKF_HSA-SAME: () #[[ATTR4:[0-9]+]] {
-; AKF_HSA-NEXT: call void @external.func()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external
; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: call void @external.func()
@@ -711,11 +480,6 @@ define amdgpu_kernel void @kern_call_external() #3 {
}
define amdgpu_kernel void @func_kern_defined() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_kern_defined
-; AKF_HSA-SAME: () #[[ATTR4]] {
-; AKF_HSA-NEXT: call void @defined.func()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_kern_defined
; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @defined.func()
@@ -726,12 +490,6 @@ define amdgpu_kernel void @func_kern_defined() #3 {
}
define i32 @use_dispatch_ptr_ret_type() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr_ret_type
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) poison, align 8
-; AKF_HSA-NEXT: ret i32 0
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr_ret_type
; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] {
; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -744,12 +502,6 @@ define i32 @use_dispatch_ptr_ret_type() #1 {
}
define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[F:%.*]] = call float @use_dispatch_ptr_ret_type()
-; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
-; AKF_HSA-NEXT: ret float [[FADD]]
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func
; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] {
; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @use_dispatch_ptr_ret_type()
@@ -762,12 +514,6 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 {
}
define float @func_indirect_call(ptr %fptr) #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_call
-; AKF_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR3]] {
-; AKF_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]()
-; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
-; AKF_HSA-NEXT: ret float [[FADD]]
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call
; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]()
@@ -781,12 +527,6 @@ define float @func_indirect_call(ptr %fptr) #3 {
declare float @extern() #3
define float @func_extern_call() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_extern_call
-; AKF_HSA-SAME: () #[[ATTR3]] {
-; AKF_HSA-NEXT: [[F:%.*]] = call float @extern()
-; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
-; AKF_HSA-NEXT: ret float [[FADD]]
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call
; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @extern()
@@ -799,12 +539,6 @@ define float @func_extern_call() #3 {
}
define float @func_null_call(ptr %fptr) #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_null_call
-; AKF_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR3]] {
-; AKF_HSA-NEXT: [[F:%.*]] = call float null()
-; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
-; AKF_HSA-NEXT: ret float [[FADD]]
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call
; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float null()
@@ -820,12 +554,6 @@ declare float @llvm.amdgcn.rcp.f32(float) #0
; Calls some other recognized intrinsic
define float @func_other_intrinsic_call(float %arg) #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call
-; AKF_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
-; AKF_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]])
-; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
-; AKF_HSA-NEXT: ret float [[FADD]]
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call
; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR16]] {
; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]])
@@ -839,11 +567,6 @@ define float @func_other_intrinsic_call(float %arg) #3 {
; Hostcall needs to be enabled for sanitizers
define amdgpu_kernel void @kern_sanitize_address() #4 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address
-; AKF_HSA-SAME: () #[[ATTR5:[0-9]+]] {
-; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address
; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4
@@ -855,11 +578,6 @@ define amdgpu_kernel void @kern_sanitize_address() #4 {
; Hostcall needs to be enabled for sanitizers
define void @func_sanitize_address() #4 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_sanitize_address
-; AKF_HSA-SAME: () #[[ATTR5]] {
-; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address
; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4
@@ -871,11 +589,6 @@ define void @func_sanitize_address() #4 {
; Hostcall needs to be enabled for sanitizers
define void @func_indirect_sanitize_address() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address
-; AKF_HSA-SAME: () #[[ATTR3]] {
-; AKF_HSA-NEXT: call void @func_sanitize_address()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address
; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address()
@@ -887,11 +600,6 @@ define void @func_indirect_sanitize_address() #3 {
; Hostcall needs to be enabled for sanitizers
define amdgpu_kernel void @kern_indirect_sanitize_address() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address
-; AKF_HSA-SAME: () #[[ATTR4]] {
-; AKF_HSA-NEXT: call void @func_sanitize_address()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address
; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] {
; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address()
@@ -906,11 +614,6 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 {
declare void @extern_func_sanitize_address() #5
define amdgpu_kernel void @kern_decl_sanitize_address() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address
-; AKF_HSA-SAME: () #[[ATTR4]] {
-; AKF_HSA-NEXT: call void @extern_func_sanitize_address()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address
; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: call void @extern_func_sanitize_address()
@@ -923,10 +626,6 @@ define amdgpu_kernel void @kern_decl_sanitize_address() #3 {
declare void @enqueue_block_decl() #6
define internal void @enqueue_block_def() #6 {
-; AKF_HSA-LABEL: define {{[^@]+}}@enqueue_block_def
-; AKF_HSA-SAME: () #[[ATTR7:[0-9]+]] {
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def
; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: ret void
@@ -935,11 +634,6 @@ define internal void @enqueue_block_def() #6 {
}
define amdgpu_kernel void @kern_call_enqueued_block_decl() {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl
-; AKF_HSA-SAME: () #[[ATTR8:[0-9]+]] {
-; AKF_HSA-NEXT: call void @enqueue_block_decl()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl
; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_decl()
@@ -950,11 +644,6 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() {
}
define amdgpu_kernel void @kern_call_enqueued_block_def() {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def
-; AKF_HSA-SAME: () #[[ATTR8]] {
-; AKF_HSA-NEXT: call void @enqueue_block_def()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def
; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_def()
@@ -965,9 +654,6 @@ define amdgpu_kernel void @kern_call_enqueued_block_def() {
}
define void @unused_enqueue_block() {
-; AKF_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block() {
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block
; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: ret void
@@ -976,9 +662,6 @@ define void @unused_enqueue_block() {
}
define internal void @known_func() {
-; AKF_HSA-LABEL: define {{[^@]+}}@known_func() {
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@known_func
; ATTRIBUTOR_HSA-SAME: () #[[ATTR25]] {
; ATTRIBUTOR_HSA-NEXT: ret void
@@ -988,11 +671,6 @@ define internal void @known_func() {
; Should never happen
define amdgpu_kernel void @kern_callsite_enqueue_block() {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block
-; AKF_HSA-SAME: () #[[ATTR8]] {
-; AKF_HSA-NEXT: call void @known_func() #[[ATTR7]]
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block
; ATTRIBUTOR_HSA-SAME: () #[[ATTR24]] {
; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR27:[0-9]+]]
@@ -1014,15 +692,6 @@ attributes #6 = { "enqueued-block" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
-; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; AKF_HSA: attributes #[[ATTR1]] = { nounwind "target-cpu"="fiji" }
-; AKF_HSA: attributes #[[ATTR2]] = { nounwind "target-cpu"="gfx900" }
-; AKF_HSA: attributes #[[ATTR3]] = { nounwind }
-; AKF_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-calls" }
-; AKF_HSA: attributes #[[ATTR5]] = { nounwind sanitize_address }
-; AKF_HSA: attributes #[[ATTR6:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" }
-; AKF_HSA: attributes #[[ATTR7]] = { "enqueued-block" }
-; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 2809f0957462a..32bb22b699b61 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
@@ -33,12 +32,6 @@ define amdgpu_kernel void @use_tgid_x(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_y
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_y
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -51,14 +44,6 @@ define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -75,14 +60,6 @@ define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -99,12 +76,6 @@ define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_z
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_z
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
@@ -117,14 +88,6 @@ define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_z
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_z
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -141,14 +104,6 @@ define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_y_z
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_y_z
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -165,16 +120,6 @@ define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_x_y_z(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -207,12 +152,6 @@ define amdgpu_kernel void @use_tidig_x(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_y
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_y
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -225,12 +164,6 @@ define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_z(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_z
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_z
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR6:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
@@ -259,14 +192,6 @@ define amdgpu_kernel void @use_tidig_x_tgid_x(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR7:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -283,16 +208,6 @@ define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR8:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
@@ -313,22 +228,6 @@ define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_all_workitems
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
-; AKF_HSA-NEXT: [[VAL3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
-; AKF_HSA-NEXT: [[VAL4:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: [[VAL5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_all_workitems
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR9:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
@@ -361,13 +260,6 @@ define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_dispatch_ptr(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR10:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -382,13 +274,6 @@ define amdgpu_kernel void @use_dispatch_ptr(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_queue_ptr(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_queue_ptr
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
-; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_queue_ptr
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR11:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
@@ -417,12 +302,6 @@ define amdgpu_kernel void @use_kernarg_segment_ptr(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
-; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
-; AKF_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
@@ -435,12 +314,6 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr
}
define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast
-; AKF_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; AKF_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
@@ -526,13 +399,6 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #1 {
}
define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_is_shared
-; AKF_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]])
-; AKF_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32
-; AKF_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) poison, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared
; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]])
@@ -547,13 +413,6 @@ define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 {
}
define amdgpu_kernel void @use_is_private(ptr %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_is_private
-; AKF_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]])
-; AKF_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32
-; AKF_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) poison, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private
; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]])
@@ -568,12 +427,6 @@ define amdgpu_kernel void @use_is_private(ptr %ptr) #1 {
}
define amdgpu_kernel void @use_alloca() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca
-; AKF_HSA-SAME: () #[[ATTR2:[0-9]+]] {
-; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
-; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca
; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
@@ -586,15 +439,6 @@ define amdgpu_kernel void @use_alloca() #1 {
}
define amdgpu_kernel void @use_alloca_non_entry_block() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block
-; AKF_HSA-SAME: () #[[ATTR2]] {
-; AKF_HSA-NEXT: entry:
-; AKF_HSA-NEXT: br label [[BB:%.*]]
-; AKF_HSA: bb:
-; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
-; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block
; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
; ATTRIBUTOR_HSA-NEXT: entry:
@@ -614,12 +458,6 @@ bb:
}
define void @use_alloca_func() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca_func
-; AKF_HSA-SAME: () #[[ATTR2]] {
-; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
-; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func
; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
@@ -638,9 +476,6 @@ attributes #1 = { nounwind }
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
-; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; AKF_HSA: attributes #[[ATTR1]] = { nounwind }
-; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index 20ce05278d213..15dc1a0529254 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=CHECK,AKF_CHECK %s
; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=CHECK,ATTRIBUTOR_CHECK %s
declare i32 @llvm.r600.read.tgid.x() #0
@@ -27,12 +26,6 @@ define amdgpu_kernel void @use_tgid_x(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_y
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_y
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y()
@@ -45,14 +38,6 @@ define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y()
@@ -69,14 +54,6 @@ define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x()
@@ -93,12 +70,6 @@ define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_z
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z()
-; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_z
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z()
@@ -111,14 +82,6 @@ define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x()
@@ -135,14 +98,6 @@ define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y()
@@ -159,16 +114,6 @@ define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_x_y_z(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tgid.z()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x()
@@ -201,12 +146,6 @@ define amdgpu_kernel void @use_tidig_x(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_y
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y()
-; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_y
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y()
@@ -219,12 +158,6 @@ define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_z(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_z
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z()
-; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_z
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR6:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z()
@@ -253,14 +186,6 @@ define amdgpu_kernel void @use_tidig_x_tgid_x(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR7:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y()
@@ -277,16 +202,6 @@ define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y()
-; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR8:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x()
@@ -307,22 +222,6 @@ define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_all_workitems
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y()
-; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z()
-; AKF_CHECK-NEXT: [[VAL3:%.*]] = call i32 @llvm.r600.read.tgid.x()
-; AKF_CHECK-NEXT: [[VAL4:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: [[VAL5:%.*]] = call i32 @llvm.r600.read.tgid.z()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_all_workitems
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR9:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x()
@@ -394,8 +293,6 @@ attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
;.
-; AKF_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; AKF_CHECK: attributes #[[ATTR1]] = { nounwind }
;.
; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index fc13b86566f76..22cc5af30da66 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -35,9 +35,9 @@ entry:
attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
; CHECK-LABEL: {{^}}min_1024_max_1024
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
; CHECK: VGPRBlocks: 10
-; CHECK: NumSGPRsForWavesPerEU: 2{{$}}
+; CHECK: NumSGPRsForWavesPerEU: 24{{$}}
; CHECK: NumVGPRsForWavesPerEU: 43
@var = addrspace(1) global float 0.0
define amdgpu_kernel void @min_1024_max_1024() #3 {
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index 46edf06c3b62c..d0107eb3ade27 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -4,8 +4,8 @@
; ALL-LABEL: {{^}}max_10_sgprs:
-; ALL: SGPRBlocks: 1
-; ALL: NumSGPRsForWavesPerEU: 10
+; ALL: SGPRBlocks: 2
+; ALL: NumSGPRsForWavesPerEU: 24
define amdgpu_kernel void @max_10_sgprs() #0 {
%one = load volatile i32, ptr addrspace(4) poison
%two = load volatile i32, ptr addrspace(4) poison
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 14519f5a5e77c..4507fd5865989 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"}
; Exactly 10 waves per execution unit.
; CHECK-LABEL: {{^}}exactly_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 3
; CHECK: VGPRBlocks: 5
-; CHECK: NumSGPRsForWavesPerEU: 20
+; CHECK: NumSGPRsForWavesPerEU: 30
; CHECK: NumVGPRsForWavesPerEU: 24
define amdgpu_kernel void @exactly_10() #9 {
%val0 = load volatile float, ptr addrspace(1) @var
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
index 682a57571d11e..35f0ccf5ba62f 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
@@ -392,7 +392,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: call_without_private_to_flat_addrspacecast
; GFX10: argumentInfo:
@@ -420,7 +421,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: call_both_with_and_without_private_to_flat_addrspacecast
; GFX10: argumentInfo:
@@ -434,7 +436,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: call_call_without_private_to_flat_addrspacecast
; GFX10: argumentInfo:
@@ -462,7 +465,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: call_call_both_with_and_without_private_to_flat_addrspacecast
; GFX10: argumentInfo:
@@ -476,7 +480,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: with_cast_call_without_private_to_flat_addrspacecast
; GFX10: argumentInfo:
@@ -490,7 +495,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: with_cast_call_with_private_to_flat_addrspacecast
; GFX10: argumentInfo:
@@ -504,7 +510,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: with_indirect_call
; GFX10: argumentInfo:
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
index 55ed11ac62972..748596d51c4ae 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
@@ -22,7 +22,7 @@
; NOOPT: .amdhsa_user_sgpr_queue_ptr 1
; NOOPT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; NOOPT: .amdhsa_user_sgpr_dispatch_id 1
-; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0
+; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 1
; NOOPT: .amdhsa_user_sgpr_private_segment_size 0
; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index d5da3e00df1a6..10ca3c9d5f2c8 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -198,11 +198,11 @@ define hidden void @use_workgroup_id_yz() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x:
; GCN-NOT: s6
-; GCN: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, use_workgroup_id_x@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, use_workgroup_id_x@rel32@hi+12
+; GCN: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+12
; GCN-NOT: s6
-; GCN: s_mov_b32 s12, s6
+; GCN: s_mov_b32 s12, s4
; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
; GCN-NEXT: s_endpgm
@@ -217,7 +217,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y:
; GCN-NOT: s12
-; GCN: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s13, s5
; GCN-NOT: s12
; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
@@ -233,7 +233,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z:
; GCN-NOT: s12
; GCN-NOT: s13
-; GCN: s_mov_b32 s14, s7
+; GCN: s_mov_b32 s14, s5
; GCN-NOT: s12
; GCN-NOT: s13
@@ -250,8 +250,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy:
; GCN-NOT: s14
-; GCN: s_mov_b32 s12, s6
-; GCN-NEXT: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s12, s4
+; GCN-NEXT: s_mov_b32 s13, s5
; GCN-NOT: s14
; GCN: s_mov_b32 s32, 0
@@ -266,9 +266,9 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz:
-; GCN: s_mov_b32 s12, s6
-; GCN: s_mov_b32 s13, s7
-; GCN: s_mov_b32 s14, s8
+; GCN: s_mov_b32 s12, s4
+; GCN: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s14, s6
; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
@@ -283,8 +283,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz:
; GCN-NOT: s13
-; GCN: s_mov_b32 s12, s6
-; GCN-NEXT: s_mov_b32 s14, s7
+; GCN: s_mov_b32 s12, s4
+; GCN-NEXT: s_mov_b32 s14, s5
; GCN-NOT: s13
; GCN: s_mov_b32 s32, 0
@@ -300,8 +300,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz:
-; GCN: s_mov_b32 s13, s7
-; GCN: s_mov_b32 s14, s8
+; GCN: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s14, s6
; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
@@ -382,7 +382,7 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
; GCN-NOT: s13
; GCN-NOT: s14
-; GCN-DAG: s_mov_b32 s12, s6
+; GCN-DAG: s_mov_b32 s12, s4
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
; GCN-NOT: s13
; GCN-NOT: s14
@@ -400,7 +400,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y:
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s13, s7
+; GCN-DAG: s_mov_b32 s13, s5
; GCN-DAG: s_mov_b32 s32, 0
; GCN: s_swappc_b64
@@ -415,7 +415,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z:
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s14, s7
+; GCN-DAG: s_mov_b32 s14, s5
; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
@@ -474,7 +474,7 @@ define hidden void @use_every_sgpr_input() #1 {
; GCN: .amdhsa_user_sgpr_queue_ptr 1
; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; GCN: .amdhsa_user_sgpr_dispatch_id 1
-; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
+; GCN: .amdhsa_user_sgpr_flat_scratch_init 0
; GCN: .amdhsa_user_sgpr_private_segment_size 0
; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
@@ -499,7 +499,7 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 {
; GCN: .amdhsa_user_sgpr_queue_ptr 1
; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 0
; GCN: .amdhsa_user_sgpr_dispatch_id 1
-; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
+; GCN: .amdhsa_user_sgpr_flat_scratch_init 0
; GCN: .amdhsa_user_sgpr_private_segment_size 0
; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index ee4a2ed883b63..18f1e8e1dbd4b 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -12,13 +12,13 @@
; OSABI-AMDHSA-ASM: .section .rodata,"a"
; OSABI-AMDHSA-ASM: .p2align 6
; OSABI-AMDHSA-ASM: .amdhsa_kernel fadd
-; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 12
+; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 14
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 18
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 1
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
; OSABI-AMDHSA-ASM: .text
@@ -31,13 +31,13 @@
; OSABI-AMDHSA-ASM: .section .rodata,"a"
; OSABI-AMDHSA-ASM: .p2align 6
; OSABI-AMDHSA-ASM: .amdhsa_kernel fsub
-; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 12
+; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 14
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 18
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 1
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
; OSABI-AMDHSA-ASM: .text
diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index c17cf1cd6bca4..c167834470e3b 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -5,6 +5,9 @@
define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
; CHECK-LABEL: _Z11test_kernelPii:
; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-NEXT: s_add_i32 s12, s12, s17
+; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s0, 3
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index fcb8fa5997b7e..fc17d9288bf40 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -6,6 +6,8 @@
define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) {
; CHECK-LABEL: eggs:
; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0
; CHECK-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8
; CHECK-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index 39554e05c96b4..f964170ccdda5 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -1,11 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=AKF_GCN %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s
define internal void @indirect() {
-; AKF_GCN-LABEL: define {{[^@]+}}@indirect() {
-; AKF_GCN-NEXT: ret void
-;
; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@indirect
; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] {
; ATTRIBUTOR_GCN-NEXT: ret void
@@ -14,14 +10,6 @@ define internal void @indirect() {
}
define amdgpu_kernel void @test_simple_indirect_call() #0 {
-; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
-; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] {
-; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
-; AKF_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8
-; AKF_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8
-; AKF_GCN-NEXT: call void [[FP]]()
-; AKF_GCN-NEXT: ret void
-;
; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] {
; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
@@ -40,7 +28,6 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 {
attributes #0 = { "amdgpu-no-dispatch-id" }
;.
-; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" }
;.
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
index 9104dc68eb9b4..72913d2596ebf 100644
--- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT: s_load_dword s6, s[8:9], 0x4
-; CHECK-NEXT: s_add_u32 s24, s24, s15
+; CHECK-NEXT: s_add_u32 s24, s24, s17
; CHECK-NEXT: s_addc_u32 s25, s25, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_bitcmp1_b32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index f3aec696abdee..e6f02295e67d5 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -94,6 +94,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-LABEL: s_add_co_br_user:
; GFX7: ; %bb.0: ; %bb
; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s0, s2, s2
; GFX7-NEXT: s_cmp_lt_u32 s0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
index 0c25ca5076790..fac9f5bf826a6 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -5,6 +5,9 @@
define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v1i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -18,6 +21,9 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -32,6 +38,9 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i
define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v2i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -54,6 +63,9 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -80,6 +92,9 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i
define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v3i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -102,6 +117,9 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -128,6 +146,9 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i
define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v4i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -150,6 +171,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -176,6 +200,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i
define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v8i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s0, s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s0, 16
@@ -192,10 +219,13 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
; VI-LABEL: extract_vector_elt_v8i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v3
@@ -213,6 +243,9 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v16i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -235,6 +268,9 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -261,6 +297,9 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x
define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v32i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s0, s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s0, 16
@@ -277,10 +316,13 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
; VI-LABEL: extract_vector_elt_v32i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v3
@@ -298,6 +340,9 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v64i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x10
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -320,6 +365,9 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x40
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -351,6 +399,9 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v2i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s2, s[8:9], 0xa
; SI-NEXT: s_load_dword s3, s[8:9], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -370,11 +421,14 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out
; VI-NEXT: s_load_dword s2, s[8:9], 0x4c
; VI-NEXT: s_load_dword s3, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s2, s2, 3
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_lshr_b32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -388,6 +442,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v3i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s2, s[8:9], 0x13
; SI-NEXT: s_load_dword s3, s[8:9], 0xa
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -406,10 +463,13 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out
; VI-NEXT: s_load_dword s2, s[8:9], 0x4c
; VI-NEXT: s_load_dword s3, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s2, s2, 3
; VI-NEXT: s_lshr_b32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -424,6 +484,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v4i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_load_dword s4, s[8:9], 0xc
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -442,6 +505,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -463,6 +529,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out
define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v8i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_load_dword s4, s[8:9], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -481,6 +550,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -502,6 +574,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_0123:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -526,6 +601,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -558,6 +636,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_0145:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -581,6 +662,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -612,6 +696,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_45:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 4
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -628,6 +715,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 4
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -649,6 +739,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
; SI-LABEL: reduce_load_vector_v16i8_extract_0145:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -672,6 +765,9 @@ define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 365588eaec3ac..2957d0201c223 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -14,6 +14,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -26,6 +29,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -80,6 +86,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -92,6 +101,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -145,6 +157,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -157,6 +172,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -196,6 +214,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; CI-LABEL: s_fabs_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
@@ -209,6 +230,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; VI-LABEL: s_fabs_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
@@ -251,6 +275,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
; CI-LABEL: fabs_fold_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -268,6 +295,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s3
@@ -327,6 +357,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -341,6 +374,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -388,6 +424,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -400,6 +439,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -443,6 +485,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
@@ -469,6 +514,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -525,9 +573,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: s_lshr_b32 s2, s4, 16
@@ -553,9 +604,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -612,6 +666,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -635,6 +692,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -722,6 +782,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -740,6 +803,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 6496b70b4d697..60334e46a4454 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -74,6 +74,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-LABEL: global_store_2xi16_align2:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -90,6 +93,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -216,8 +222,10 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1
-; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -227,6 +235,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2
; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5
; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3
@@ -243,6 +252,9 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -351,6 +363,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-LABEL: global_store_2xi16_align4:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -361,6 +376,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 4e12a30c6f6f4..9919497acea73 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -24,6 +24,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
; GFX678-LABEL: v_test_canonicalize_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -76,6 +79,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s2, s[8:9], 0x2
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX6-NEXT: v_mov_b32_e32 v0, s0
@@ -87,6 +93,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -132,6 +141,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fabs_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -184,6 +196,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -237,6 +252,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fneg_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -289,6 +307,9 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
; GFX678-LABEL: test_fold_canonicalize_undef_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -328,6 +349,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -367,6 +391,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_bfrev_b32_e32 v2, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -409,6 +436,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 1.0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -449,6 +479,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, -1.0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -489,6 +522,9 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
; GFX678-LABEL: test_fold_canonicalize_literal_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -529,6 +565,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -568,10 +607,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: v_mov_b32_e32 v1, s1
; GFX678-NEXT: flat_store_dword v[0:1], v2
; GFX678-NEXT: s_endpgm
@@ -612,10 +654,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: v_mov_b32_e32 v1, s1
; GFX678-NEXT: flat_store_dword v[0:1], v2
; GFX678-NEXT: s_endpgm
@@ -656,10 +701,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: v_mov_b32_e32 v1, s1
; GFX678-NEXT: flat_store_dword v[0:1], v2
; GFX678-NEXT: s_endpgm
@@ -700,6 +748,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -740,6 +791,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_bfrev_b32_e32 v2, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -782,6 +836,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -822,6 +879,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out
; GFX678-LABEL: test_fold_canonicalize_qnan_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -862,6 +922,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -902,6 +965,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -942,6 +1008,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -982,6 +1051,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1022,6 +1094,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1062,6 +1137,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1102,6 +1180,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1
; GFX678-LABEL: v_test_canonicalize_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1153,6 +1234,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
; GFX6-LABEL: s_test_canonicalize_var_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3]
; GFX6-NEXT: v_mov_b32_e32 v0, s0
@@ -1163,6 +1247,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
; GFX8-LABEL: s_test_canonicalize_var_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -1205,6 +1292,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fabs_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1257,6 +1347,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1
; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1310,6 +1403,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fneg_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1362,10 +1458,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, v0
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, v0
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1407,10 +1506,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1450,10 +1552,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1491,10 +1596,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1532,10 +1640,13 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %
; GFX678-LABEL: test_fold_canonicalize_literal_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1573,10 +1684,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, v0
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, v0
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1618,10 +1732,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, -1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1662,10 +1779,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1705,10 +1825,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, -1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1749,10 +1872,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out
; GFX678-LABEL: test_fold_canonicalize_qnan_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1790,10 +1916,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1831,10 +1960,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1872,10 +2004,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1913,10 +2048,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1954,10 +2092,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1995,10 +2136,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -2037,6 +2181,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2054,6 +2201,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2117,6 +2267,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2134,6 +2287,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2197,6 +2353,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2215,6 +2374,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2279,6 +2441,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2302,6 +2467,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2368,6 +2536,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2385,6 +2556,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2448,6 +2622,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2465,6 +2642,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2529,6 +2709,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2547,6 +2730,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2612,6 +2798,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2635,6 +2824,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2700,6 +2892,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -2717,6 +2912,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index fee6540f43c64..513befe6e19e5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -11,13 +11,14 @@
; ALL-LABEL: {{^}}test:
-; ALL-NOT: flat_scr
+; HSA-DEFAULT: flat_scr
+; HSA-NODEFAULT-NOT: flat_scr
; HSA-DEFAULT: flat_store_dword
; HSA-NODEFAULT: buffer_store_dword
; HSA-NOADDR64: flat_store_dword
-; HSA: .amdhsa_user_sgpr_flat_scratch_init 0
+; HSA: .amdhsa_user_sgpr_flat_scratch_init 1
; NOHSA-DEFAULT: buffer_store_dword
; NOHSA-NODEFAULT: flat_store_dword
@@ -28,6 +29,8 @@ entry:
ret void
}
+; ALL-LABEL: {{^}}test_addr64:
+
; HSA-DEFAULT: flat_store_dword
; HSA-NODEFAULT: buffer_store_dword
; HSA-NOADDR64: flat_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index 45223a24e021a..a59382ba20dc5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -8,28 +8,34 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=+xnack | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack | FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack | FileCheck -check-prefixes=HSA-VI-XNACK,GCN %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=GFX9-ARCH-FLAT-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=GFX9-ARCH-FLAT-XNACK,GCN %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=GFX10-ARCH-FLAT-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=GFX10-ARCH-FLAT-XNACK,GCN %s
; GCN-LABEL: {{^}}no_vcc_no_flat:
; NOT-HSA-CI: .amdhsa_reserve_xnack_mask
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; TotalNumSgprs: 8
; VI-NOXNACK: ; TotalNumSgprs: 8
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 8
; VI-XNACK: ; TotalNumSgprs: 12
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8
+; HSA-VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
define amdgpu_kernel void @no_vcc_no_flat() {
entry:
call void asm sideeffect "", "~{s7}"()
@@ -41,12 +47,18 @@ entry:
; NOT-HSA-CI: .amdhsa_reserve_xnack_mask
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; TotalNumSgprs: 10
; VI-NOXNACK: ; TotalNumSgprs: 10
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 10
; VI-XNACK: ; TotalNumSgprs: 12
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10
+; HSA-VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
define amdgpu_kernel void @vcc_no_flat() {
entry:
call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -58,12 +70,18 @@ entry:
; NOT-HSA-CI: .amdhsa_reserve_xnack_mask
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; TotalNumSgprs: 12
; VI-NOXNACK: ; TotalNumSgprs: 14
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
; VI-XNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8
+; HSA-VI-XNACK: ; TotalNumSgprs: 24
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
define amdgpu_kernel void @no_vcc_flat() {
entry:
call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -75,12 +93,18 @@ entry:
; NOT-HSA-CI: .amdhsa_reserve_xnack_mask
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; TotalNumSgprs: 12
; VI-NOXNACK: ; TotalNumSgprs: 14
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
; VI-XNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10
+; HSA-VI-XNACK: ; TotalNumSgprs: 24
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
define amdgpu_kernel void @vcc_flat() {
entry:
call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -95,12 +119,18 @@ entry:
; NOT-HSA-CI: .amdhsa_reserve_xnack_mask
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: NumSgprs: 4
; VI-NOXNACK: NumSgprs: 6
+; HSA-VI-NOXNACK: NumSgprs: 24
; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
+; HSA-VI-XNACK: NumSgprs: 24
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -115,9 +145,13 @@ entry:
; CI: NumSgprs: 4
; VI-NOXNACK: NumSgprs: 6
+; HSA-VI-NOXNACK: NumSgprs: 24
; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
+; HSA-VI-XNACK: NumSgprs: 24
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr_lo() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -129,12 +163,18 @@ entry:
; NOT-HSA-CI: .amdhsa_reserve_xnack_mask
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: NumSgprs: 4
; VI-NOXNACK: NumSgprs: 6
+; HSA-VI-NOXNACK: NumSgprs: 24
; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
+; HSA-VI-XNACK: NumSgprs: 24
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr_hi() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 64be9cb72a6ee..fb2448fb80744 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -16,6 +16,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
; VI-LABEL: multiple_fadd_use_test_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f32_e64 v0, s3, -1.0
; VI-NEXT: v_add_f32_e64 v1, s2, -1.0
@@ -80,8 +83,11 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-NEXT: s_load_dword s3, s[8:9], 0x2c
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_add_u32 s2, s0, 4
; VI-NEXT: v_add_f32_e64 v2, s4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -139,6 +145,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
; VI-LABEL: multiple_use_fadd_fmad_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s4, s0, 4
@@ -194,6 +203,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s6, s4, 4
; VI-NEXT: v_mov_b32_e32 v0, s1
@@ -255,6 +267,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0
; VI-NEXT: v_mul_f32_e32 v2, s2, v0
@@ -303,10 +318,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-NEXT: v_mul_f32_e32 v2, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -350,6 +368,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
; VI-DENORM: ; %bb.0:
; VI-DENORM-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
+; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16
; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0
@@ -368,6 +389,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
; VI-FLUSH: ; %bb.0:
; VI-FLUSH-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
+; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16
; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0
@@ -482,6 +506,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
; VI-DENORM: ; %bb.0:
; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
+; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
@@ -503,6 +530,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
; VI-FLUSH: ; %bb.0:
; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
+; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
@@ -599,6 +629,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
; VI-DENORM: ; %bb.0:
; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
+; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
@@ -620,6 +653,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
; VI-FLUSH: ; %bb.0:
; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
+; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3
@@ -718,6 +754,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-DENORM-NEXT: s_load_dword s6, s[8:9], 0x8
+; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
+; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
@@ -725,6 +763,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s1
; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
+; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-DENORM-NEXT: s_add_u32 s4, s2, 2
; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
; VI-DENORM-NEXT: s_addc_u32 s5, s3, 0
@@ -741,6 +780,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-FLUSH-NEXT: s_load_dword s6, s[8:9], 0x8
+; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
+; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
@@ -748,6 +789,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s1
; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
+; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-FLUSH-NEXT: s_add_u32 s4, s2, 2
; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
; VI-FLUSH-NEXT: s_addc_u32 s5, s3, 0
@@ -847,6 +889,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0
; VI-NEXT: v_mul_f16_e32 v2, s2, v0
@@ -898,10 +943,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 0xc600
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f16_e32 v0, s2, v0
; VI-NEXT: v_mul_f16_e32 v2, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 9642b36ecb7e8..eb9eb42df4c78 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -9,6 +9,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
; CI-LABEL: fneg_fabs_fadd_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -26,6 +29,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -85,6 +91,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
; CI-LABEL: fneg_fabs_fmul_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s1, s0, 0x7fff
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -103,6 +112,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -166,6 +178,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_bitset1_b32 s2, 15
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -178,6 +193,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s2, 15
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -233,6 +251,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_bitset1_b32 s2, 15
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -245,6 +266,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s2, 15
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -298,6 +322,9 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
; CIVI-LABEL: v_fneg_fabs_f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -352,6 +379,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
@@ -374,7 +404,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0x4000
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v2, s3
@@ -383,6 +415,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -425,6 +458,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_or_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -437,6 +473,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_or_b32 s2, s2, 0x80008000
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -477,6 +516,9 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
; CIVI-LABEL: fneg_fabs_v4f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000
; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000
@@ -520,6 +562,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
; CI-LABEL: fold_user_fneg_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s0, 16
; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
@@ -541,7 +586,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0xc400
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v2, s3
@@ -549,6 +596,7 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
; VI-NEXT: v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -588,6 +636,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
@@ -605,6 +656,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
@@ -659,6 +713,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010
@@ -683,7 +740,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v5, 0xc400
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s1, s4, 16
@@ -692,6 +751,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
; VI-NEXT: v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mul_f16_e64 v5, |s4|, -4.0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_or_b32_e32 v4, v5, v4
; VI-NEXT: v_mov_b32_e32 v5, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 781a2ca3146f5..058c273a65d99 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1477,6 +1477,8 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x4
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x6
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bitcmp1_b32 s6, 0
; GFX7-NEXT: s_cselect_b64 vcc, -1, 0
@@ -1488,6 +1490,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
; GFX7-NEXT: s_cselect_b32 s0, s0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s5
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 23e4ba9fd4ed7..98e0b27cd955d 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -11,6 +11,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x8000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -23,6 +26,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -78,6 +84,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -92,6 +101,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -152,6 +164,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x8000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -164,6 +179,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -217,6 +235,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
; CI-LABEL: v_fneg_fold_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -234,6 +255,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: v_fneg_fold_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -289,6 +313,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -301,6 +328,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -340,14 +370,17 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 {
; CIVI-LABEL: s_fneg_v2f16_nonload:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
; CIVI-NEXT: ;;#ASMSTART
; CIVI-NEXT: ; def s2
; CIVI-NEXT: ;;#ASMEND
; CIVI-NEXT: s_xor_b32 s2, s2, 0x80008000
-; CIVI-NEXT: v_mov_b32_e32 v2, s2
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: v_mov_b32_e32 v1, s1
+; CIVI-NEXT: v_mov_b32_e32 v2, s2
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
;
@@ -388,6 +421,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -402,6 +438,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -449,6 +488,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -461,6 +503,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -501,6 +546,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; CI-LABEL: v_fneg_fold_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -527,6 +575,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; GFX8-LABEL: v_fneg_fold_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -572,6 +623,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
; CI-LABEL: v_extract_fneg_fold_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -593,6 +647,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
; GFX8-LABEL: v_extract_fneg_fold_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -672,6 +729,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0
; CIVI-LABEL: v_extract_fneg_no_fold_v2f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index a2fca33af1046..10573aad38a51 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -10,6 +10,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -21,6 +24,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -46,6 +52,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -57,6 +66,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -81,6 +93,9 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
; CIVI-LABEL: load_v3f16_arg:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_add_u32 s4, s0, 4
; CIVI-NEXT: s_addc_u32 s5, s1, 0
@@ -114,6 +129,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
; CIVI-LABEL: load_v4f16_arg:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v2, s2
@@ -139,6 +157,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -153,6 +174,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -183,6 +207,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s3, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -196,6 +223,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -227,6 +257,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -238,6 +271,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -265,6 +301,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s3, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -278,6 +317,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -308,6 +350,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
; CI-LABEL: extload_v3f16_to_v3f32_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
@@ -321,6 +366,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
; VI-LABEL: extload_v3f16_to_v3f32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
@@ -351,6 +399,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; CI-LABEL: extload_v4f16_to_v4f32_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s3, 16
; CI-NEXT: s_lshr_b32 s5, s2, 16
@@ -366,6 +417,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; VI-LABEL: extload_v4f16_to_v4f32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: s_lshr_b32 s5, s2, 16
@@ -401,6 +455,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s6, s1, 16
; CI-NEXT: s_lshr_b32 s7, s0, 16
@@ -429,6 +486,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s1, 16
; VI-NEXT: s_lshr_b32 s7, s0, 16
@@ -485,6 +545,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
; CI-LABEL: extload_f16_to_f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -498,6 +561,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
; VI-LABEL: extload_f16_to_f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -529,6 +595,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
; CI-LABEL: extload_v2f16_to_v2f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s1
@@ -545,6 +614,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
; VI-LABEL: extload_v2f16_to_v2f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s1
@@ -582,6 +654,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
; CI-LABEL: extload_v3f16_to_v3f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
; CI-NEXT: s_lshr_b32 s4, s2, 16
@@ -603,6 +678,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
; VI-LABEL: extload_v3f16_to_v3f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
; VI-NEXT: s_lshr_b32 s4, s2, 16
@@ -648,6 +726,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
; CI-LABEL: extload_v4f16_to_v4f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
@@ -673,6 +754,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
; VI-LABEL: extload_v4f16_to_v4f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s3, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s3
@@ -726,6 +810,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s6, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s6
@@ -773,6 +860,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s0, 16
; VI-NEXT: s_lshr_b32 s8, s2, 16
@@ -858,6 +948,9 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr
; CIVI-LABEL: global_load_store_f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -886,6 +979,9 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: global_load_store_v2f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -914,6 +1010,9 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add
; CIVI-LABEL: global_load_store_v4f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
@@ -942,6 +1041,9 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: global_load_store_v8f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -970,6 +1072,9 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr
; CIVI-LABEL: global_extload_f16_to_f32:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -1001,6 +1106,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v2f16_to_v2f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1017,6 +1125,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v2f16_to_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1052,6 +1163,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v3f16_to_v3f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1069,6 +1183,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v3f16_to_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1106,6 +1223,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v4f16_to_v4f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1125,6 +1245,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v4f16_to_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1165,6 +1288,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v8f16_to_v8f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1195,6 +1321,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v8f16_to_v8f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1251,6 +1380,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; CI-LABEL: global_extload_v16f16_to_v16f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s4, s2, 16
; CI-NEXT: v_mov_b32_e32 v5, s3
@@ -1309,6 +1441,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; VI-LABEL: global_extload_v16f16_to_v16f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1406,6 +1541,9 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr
; CIVI-LABEL: global_extload_f16_to_f64:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -1440,6 +1578,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v2f16_to_v2f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1458,6 +1599,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v2f16_to_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1498,6 +1642,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v3f16_to_v3f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1523,6 +1670,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v3f16_to_v3f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1574,6 +1724,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v4f16_to_v4f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1602,6 +1755,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v4f16_to_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1659,6 +1815,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v8f16_to_v8f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1707,6 +1866,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v8f16_to_v8f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1791,6 +1953,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; CI-LABEL: global_extload_v16f16_to_v16f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1885,6 +2050,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; VI-LABEL: global_extload_v16f16_to_v16f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2039,6 +2207,9 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p
; CIVI-LABEL: global_truncstore_f32_to_f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -2070,6 +2241,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v2f32_to_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2087,6 +2261,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v2f32_to_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2123,6 +2300,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v3f32_to_v3f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2146,6 +2326,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v3f32_to_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2191,6 +2374,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v4f32_to_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2212,6 +2398,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v4f32_to_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2254,6 +2443,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v8f32_to_v8f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2289,6 +2481,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v8f32_to_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2352,6 +2547,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; CI-LABEL: global_truncstore_v16f32_to_v16f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s4, s2, 32
; CI-NEXT: s_addc_u32 s5, s3, 0
@@ -2420,6 +2618,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; VI-LABEL: global_truncstore_v16f32_to_v16f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 32
; VI-NEXT: s_addc_u32 s5, s3, 0
@@ -2530,6 +2731,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
; CI-LABEL: fadd_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -2547,6 +2751,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s3
@@ -2577,6 +2784,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
; CI-LABEL: fadd_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
@@ -2598,6 +2808,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
; VI-LABEL: fadd_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: s_lshr_b32 s5, s2, 16
@@ -2629,6 +2842,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-LABEL: fadd_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2666,6 +2882,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: fadd_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2706,6 +2925,9 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s10, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v4, s0
@@ -2764,6 +2986,9 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s10, s7, 16
; VI-NEXT: s_lshr_b32 s11, s3, 16
@@ -2824,6 +3049,9 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr
; CIVI-LABEL: test_bitcast_from_half:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
@@ -2853,6 +3081,9 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
; CIVI-LABEL: test_bitcast_to_half:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 8c017fa5ec263..741ea419c2a45 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -15,7 +15,8 @@
; CHECK: .max_flat_workgroup_size: 1024
; CHECK: .name: test
; CHECK: .private_segment_fixed_size: 0
-; CHECK: .sgpr_count: 10
+; GFX700: .sgpr_count: 22
+; GFX803: .sgpr_count: 24
; CHECK: .symbol: test.kd
; CHECK: .vgpr_count: {{3|6}}
; WAVE64: .wavefront_size: 64
@@ -48,8 +49,8 @@ entry:
; CHECK: .name: num_spilled_sgprs
; GFX700: .sgpr_spill_count: 10
-; GFX803: .sgpr_spill_count: 10
-; GFX900: .sgpr_spill_count: 62
+; GFX803: .sgpr_spill_count: 0
+; GFX900: .sgpr_spill_count: 0
; GFX1010: .sgpr_spill_count: 60
; CHECK: .symbol: num_spilled_sgprs.kd
define amdgpu_kernel void @num_spilled_sgprs(
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 5a2a976e23846..024593c49dba1 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -43,7 +43,7 @@
; ELF: 00E0: 6E616D65 A673696D 706C65BB 2E707269
; ELF: 00F0: 76617465 5F736567 6D656E74 5F666978
; ELF: 0100: 65645F73 697A6500 AB2E7367 70725F63
-; ELF: 0110: 6F756E74 06B12E73 6770725F 7370696C
+; ELF: 0110: 6F756E74 0EB12E73 6770725F 7370696C
; ELF: 0120: 6C5F636F 756E7400 A72E7379 6D626F6C
; ELF: 0130: A973696D 706C652E 6B64AB2E 76677072
; ELF: 0140: 5F636F75 6E7403B1 2E766770 725F7370
@@ -59,7 +59,7 @@
; ELF: 01E0: 73696D70 6C655F6E 6F5F6B65 726E6172
; ELF: 01F0: 6773BB2E 70726976 6174655F 7365676D
; ELF: 0200: 656E745F 66697865 645F7369 7A6500AB
-; ELF: 0210: 2E736770 725F636F 756E7400 B12E7367
+; ELF: 0210: 2E736770 725F636F 756E740C B12E7367
; ELF: 0220: 70725F73 70696C6C 5F636F75 6E7400A7
; ELF: 0230: 2E73796D 626F6CB5 73696D70 6C655F6E
; ELF: 0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 1547ebd6ce343..c7489e90aec27 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -12,7 +12,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40
-; GFX8V4-NEXT: v_mov_b32_e32 v4, 1
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1
; GFX8V4-NEXT: s_cselect_b32 s3, s3, 0
@@ -22,6 +24,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V4-NEXT: v_mov_b32_e32 v1, s3
; GFX8V4-NEXT: s_cselect_b32 s0, s2, 0
; GFX8V4-NEXT: s_cselect_b32 s1, s1, 0
+; GFX8V4-NEXT: v_mov_b32_e32 v4, 1
; GFX8V4-NEXT: v_mov_b32_e32 v2, s1
; GFX8V4-NEXT: v_mov_b32_e32 v3, s0
; GFX8V4-NEXT: flat_store_dword v[0:1], v4
@@ -35,7 +38,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0xc8
-; GFX8V5-NEXT: v_mov_b32_e32 v4, 1
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1
; GFX8V5-NEXT: s_cselect_b32 s2, s2, 0
@@ -45,6 +50,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V5-NEXT: v_mov_b32_e32 v1, s2
; GFX8V5-NEXT: s_cselect_b32 s0, s3, 0
; GFX8V5-NEXT: s_cselect_b32 s1, s1, 0
+; GFX8V5-NEXT: v_mov_b32_e32 v4, 1
; GFX8V5-NEXT: v_mov_b32_e32 v2, s1
; GFX8V5-NEXT: v_mov_b32_e32 v3, s0
; GFX8V5-NEXT: flat_store_dword v[0:1], v4
@@ -57,9 +63,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V4-LABEL: addrspacecast:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX9V4-NEXT: v_mov_b32_e32 v4, 1
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1
; GFX9V4-NEXT: s_cselect_b32 s2, s3, 0
@@ -69,6 +76,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V4-NEXT: v_mov_b32_e32 v1, s2
; GFX9V4-NEXT: s_cselect_b32 s0, s5, 0
; GFX9V4-NEXT: s_cselect_b32 s1, s1, 0
+; GFX9V4-NEXT: v_mov_b32_e32 v4, 1
; GFX9V4-NEXT: v_mov_b32_e32 v2, s1
; GFX9V4-NEXT: v_mov_b32_e32 v3, s0
; GFX9V4-NEXT: flat_store_dword v[0:1], v4
@@ -81,9 +89,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V5-LABEL: addrspacecast:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX9V5-NEXT: v_mov_b32_e32 v4, 1
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1
; GFX9V5-NEXT: s_cselect_b32 s2, s3, 0
@@ -93,6 +102,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V5-NEXT: v_mov_b32_e32 v1, s2
; GFX9V5-NEXT: s_cselect_b32 s0, s5, 0
; GFX9V5-NEXT: s_cselect_b32 s1, s1, 0
+; GFX9V5-NEXT: v_mov_b32_e32 v4, 1
; GFX9V5-NEXT: v_mov_b32_e32 v2, s1
; GFX9V5-NEXT: v_mov_b32_e32 v3, s0
; GFX9V5-NEXT: flat_store_dword v[0:1], v4
@@ -114,6 +124,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40
; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -126,6 +139,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc
; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -168,6 +184,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44
; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -180,6 +199,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8
; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -263,7 +285,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V4: ; %bb.0:
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
; GFX8V4-NEXT: v_mov_b32_e32 v0, s6
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8V4-NEXT: v_mov_b32_e32 v1, s7
; GFX8V4-NEXT: s_add_u32 s0, s8, 8
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -288,7 +313,10 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V5: ; %bb.0:
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
; GFX8V5-NEXT: v_mov_b32_e32 v0, s6
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8V5-NEXT: v_mov_b32_e32 v1, s7
; GFX8V5-NEXT: s_add_u32 s0, s8, 8
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index 2ceaca3497ece..696ea98254086 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,15 +8,15 @@
define amdgpu_kernel void @s_input_output_i128() {
; GFX908-LABEL: name: s_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %12
- ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %12
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %13
+ ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: s_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %10
- ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %10
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %11
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=s"()
@@ -27,15 +27,15 @@ define amdgpu_kernel void @s_input_output_i128() {
define amdgpu_kernel void @v_input_output_i128() {
; GFX908-LABEL: name: v_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %12
- ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %12
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %13
+ ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:VReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: v_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %10
- ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %10
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %11
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6619145 /* reguse:VReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=v"()
@@ -46,15 +46,15 @@ define amdgpu_kernel void @v_input_output_i128() {
define amdgpu_kernel void @a_input_output_i128() {
; GFX908-LABEL: name: a_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:AReg_128 */, def %12
- ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %12
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:AReg_128 */, def %13
+ ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:AReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: a_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:AReg_128_Align2 */, def %10
- ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %10
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:AReg_128_Align2 */, def %11
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6488073 /* reguse:AReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = call i128 asm sideeffect "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index 75db7571444bc..b51cb9df8d784 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
; VI-LABEL: s_insertelement_v2bf16_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -82,6 +85,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
; VI-LABEL: s_insertelement_v2bf16_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -144,6 +150,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -216,6 +225,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -286,6 +298,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -358,6 +373,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -435,11 +453,14 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1)
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -531,14 +552,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, s4, v0, v4
@@ -611,14 +635,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, v0, s4, v4
@@ -689,14 +716,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
@@ -769,14 +799,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, v1, s4, v4
@@ -853,9 +886,12 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1)
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -948,9 +984,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
@@ -1065,9 +1104,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -1245,11 +1287,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -1417,11 +1462,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 49879f66dd852..2cecbe376520d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -21,6 +21,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2i16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -68,6 +71,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -84,6 +90,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -152,6 +161,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -172,6 +184,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -253,6 +268,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -268,6 +286,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -322,6 +343,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -341,6 +365,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -428,6 +455,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -450,6 +480,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -544,6 +577,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2i16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -590,6 +626,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -606,6 +645,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -669,6 +711,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2f16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -714,6 +759,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2f16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -760,6 +808,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -778,6 +829,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -834,9 +888,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -853,9 +910,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -926,6 +986,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -944,6 +1007,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -999,6 +1065,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1017,6 +1086,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1084,6 +1156,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1102,6 +1177,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1169,6 +1247,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1187,6 +1268,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1241,6 +1325,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1259,6 +1346,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1313,6 +1403,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1331,6 +1424,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1399,6 +1495,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1417,6 +1516,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1491,6 +1593,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s4, s[4:5], 0x0
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1510,6 +1615,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1572,9 +1680,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -1593,9 +1704,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -1658,11 +1772,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1685,11 +1802,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_load_dword v4, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1758,14 +1878,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, s4, v0, v4
@@ -1777,9 +1900,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -1851,14 +1977,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, v0, s4, v4
@@ -1870,9 +1999,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -1944,14 +2076,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
@@ -1963,9 +2098,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2037,14 +2175,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, v1, s4, v4
@@ -2056,9 +2197,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2130,14 +2274,17 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
@@ -2149,9 +2296,12 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2229,6 +2379,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
@@ -2256,6 +2409,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: flat_load_dword v4, v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
@@ -2359,9 +2515,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -2385,9 +2544,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2454,9 +2616,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
@@ -2474,9 +2639,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
@@ -2548,9 +2716,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -2568,9 +2739,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
@@ -2674,9 +2848,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -2728,9 +2905,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
@@ -2914,11 +3094,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -2941,9 +3124,12 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s3
; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v4
@@ -3034,12 +3220,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT: v_mov_b32_e32 v12, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -3047,6 +3235,7 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
+; VI-NEXT: v_mov_b32_e32 v12, 0x3020504
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_perm_b32 v3, s4, v3, v12
@@ -3060,11 +3249,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -3219,11 +3411,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -3316,11 +3511,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3]
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
index f0609f62a9024..5dff7372ab561 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -7,6 +7,9 @@
define amdgpu_kernel void @use_group_to_global_addrspacecast(ptr addrspace(3) %ptr) {
; CHECK-LABEL: use_group_to_global_addrspacecast:
; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-NEXT: s_add_i32 s12, s12, s17
+; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: flat_store_dword v[0:1], v0
; CHECK-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
index 621187100f323..55a5d50f06bbd 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
@@ -6,6 +6,8 @@ define amdgpu_kernel void @load_idx_idy(ptr addrspace(4) %disp, ptr %g) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dword s6, s[4:5], 0x4
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index 496a1c652da25..1a32953305bbc 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -7,7 +7,7 @@ declare void @llvm.trap() #0
; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0
; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0
; DOORBELL-NEXT: .amdhsa_kernarg_size 8
-; DOORBELL-NEXT: .amdhsa_user_sgpr_count 12
+; DOORBELL-NEXT: .amdhsa_user_sgpr_count 14
; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
; DOORBELL: .end_amdhsa_kernel
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 4b6cc32522f5b..7179f687c70f2 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -47,11 +47,7 @@
; GCN-O0-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O0-NEXT: Expand reduction intrinsics
-; GCN-O0-NEXT: CallGraph Construction
-; GCN-O0-NEXT: Call Graph SCC Pass Manager
-; GCN-O0-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O0-NEXT: FunctionPass Manager
-; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O0-NEXT: CallGraph Construction
; GCN-O0-NEXT: Call Graph SCC Pass Manager
@@ -232,11 +228,7 @@
; GCN-O1-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; GCN-O1-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O1-NEXT: Expand reduction intrinsics
-; GCN-O1-NEXT: CallGraph Construction
-; GCN-O1-NEXT: Call Graph SCC Pass Manager
-; GCN-O1-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O1-NEXT: FunctionPass Manager
-; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O1-NEXT: CallGraph Construction
; GCN-O1-NEXT: Call Graph SCC Pass Manager
@@ -531,11 +523,7 @@
; GCN-O1-OPTS-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O1-OPTS-NEXT: Expand reduction intrinsics
; GCN-O1-OPTS-NEXT: Early CSE
-; GCN-O1-OPTS-NEXT: CallGraph Construction
-; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager
-; GCN-O1-OPTS-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O1-OPTS-NEXT: FunctionPass Manager
-; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O1-OPTS-NEXT: CallGraph Construction
; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager
@@ -848,11 +836,7 @@
; GCN-O2-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O2-NEXT: Expand reduction intrinsics
; GCN-O2-NEXT: Early CSE
-; GCN-O2-NEXT: CallGraph Construction
-; GCN-O2-NEXT: Call Graph SCC Pass Manager
-; GCN-O2-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O2-NEXT: FunctionPass Manager
-; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O2-NEXT: CallGraph Construction
; GCN-O2-NEXT: Call Graph SCC Pass Manager
@@ -1180,11 +1164,7 @@
; GCN-O3-NEXT: Lazy Block Frequency Analysis
; GCN-O3-NEXT: Optimization Remark Emitter
; GCN-O3-NEXT: Global Value Numbering
-; GCN-O3-NEXT: CallGraph Construction
-; GCN-O3-NEXT: Call Graph SCC Pass Manager
-; GCN-O3-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O3-NEXT: FunctionPass Manager
-; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O3-NEXT: CallGraph Construction
; GCN-O3-NEXT: Call Graph SCC Pass Manager
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index f93d80cc7adf8..4edd0357c6e7a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -30,9 +30,12 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x32
; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -59,10 +62,13 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x32
; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -133,6 +139,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; CI-SDAG: ; %bb.0:
; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1
; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x32
+; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1
; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -166,6 +175,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x32
+; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0
; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index 637d8388cddf1..9d078f7906b4d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -63,9 +63,12 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x33
; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -92,10 +95,13 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x33
; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -200,6 +206,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; CI-SDAG: ; %bb.0:
; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1
; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x33
+; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1
; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -233,6 +242,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x33
+; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0
; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 97219a8f143ce..0fe371c1b51fe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -23,8 +23,11 @@ define void @function_lds_id(ptr addrspace(1) %out) {
define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
; GCN-LABEL: kernel_lds_id:
; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s2, s12, 42
+; GCN-NEXT: s_add_i32 s2, s14, 42
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
@@ -74,6 +77,9 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
; GCN-LABEL: doesnt_use_it:
; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v2, 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 55fa02a0c582c..cc9e34be209b4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -284,6 +284,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -294,6 +297,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -309,10 +315,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -321,10 +330,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -337,10 +349,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -349,11 +364,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -366,12 +384,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_m0:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 m0, -1
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
@@ -379,12 +400,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
; CHECK-GISEL-LABEL: test_readfirstlane_m0:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 m0, -1
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -398,25 +422,31 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 s2, 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -430,13 +460,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -444,13 +477,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -464,13 +500,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -478,13 +517,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index edb6ebcee1325..f2b0959cc706e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -179,6 +179,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -189,6 +192,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -204,10 +210,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -216,10 +225,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -232,10 +244,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -244,11 +259,14 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -262,6 +280,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -281,6 +302,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -311,6 +335,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -332,6 +359,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -365,6 +395,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -386,6 +419,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -419,12 +455,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
; CHECK-SDAG-LABEL: test_readlane_m0_sreg:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 m0, -1
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
@@ -432,12 +471,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
; CHECK-GISEL-LABEL: test_readlane_m0_sreg:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 m0, -1
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -454,11 +496,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v0
; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
@@ -468,10 +513,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; def v0
; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -485,14 +533,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -505,10 +556,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: ; def v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -523,14 +577,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -543,10 +600,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: ; def v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -561,25 +621,31 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou
; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 s2, 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -593,13 +659,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -607,13 +676,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -627,13 +699,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -641,13 +716,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 04d179478590b..4ac2cc98970b5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -15,6 +15,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
; GFX802-SDAG-LABEL: test_writelane_sreg_i32:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
@@ -53,6 +56,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
@@ -98,6 +104,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
@@ -147,6 +156,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
@@ -202,6 +214,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
@@ -251,6 +266,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
@@ -306,6 +324,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -348,6 +369,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -396,6 +420,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
@@ -444,6 +471,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -498,11 +528,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
@@ -551,11 +584,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -609,6 +645,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -668,6 +707,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -738,6 +780,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -803,6 +848,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -877,7 +925,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -886,6 +936,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1]
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
@@ -946,7 +997,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
-; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -956,6 +1009,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000
; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0
; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1028,15 +1082,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
; GFX802-SDAG-NEXT: ;;#ASMSTART
; GFX802-SDAG-NEXT: s_mov_b32 m0, -1
; GFX802-SDAG-NEXT: ;;#ASMEND
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: s_mov_b32 s4, m0
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
; GFX802-SDAG-NEXT: s_endpgm
;
@@ -1081,15 +1138,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
; GFX802-GISEL-NEXT: ;;#ASMSTART
; GFX802-GISEL-NEXT: s_mov_b32 m0, -1
; GFX802-GISEL-NEXT: ;;#ASMEND
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: s_mov_b32 s4, m0
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX802-GISEL-NEXT: s_endpgm
;
@@ -1138,6 +1198,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -1180,6 +1243,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1227,6 +1293,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
; GFX802-SDAG-LABEL: test_writelane_imm_i64:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
@@ -1270,6 +1339,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
; GFX802-GISEL-LABEL: test_writelane_imm_i64:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -1319,6 +1391,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
; GFX802-SDAG-LABEL: test_writelane_imm_f64:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
@@ -1362,6 +1437,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
; GFX802-GISEL-LABEL: test_writelane_imm_f64:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -1412,6 +1490,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
@@ -1449,6 +1530,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
@@ -1492,10 +1576,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0
@@ -1538,11 +1625,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -1589,10 +1679,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0
@@ -1635,11 +1728,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -1684,7 +1780,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -1716,7 +1815,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1754,11 +1856,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
@@ -1797,11 +1902,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -1845,11 +1953,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
@@ -1888,11 +1999,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 6f95364ac3644..919c1dfd4694e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
; GFX7-HSA-LABEL: constant_load_f64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -90,7 +93,10 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu
;
; GFX7-HSA-LABEL: constant_load_2v4f64:
; GFX7-HSA: ; %bb.0: ; %entry
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index f6e9f152dca5e..a185157a553cf 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -27,6 +27,9 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: constant_load_i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -117,6 +120,9 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v2i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -188,6 +194,9 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v3i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: s_add_u32 s4, s0, 4
@@ -286,6 +295,9 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v4i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -360,6 +372,9 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v8i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -445,6 +460,9 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
; GCN-HSA-LABEL: constant_load_v16i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GCN-HSA-NEXT: s_add_u32 s10, s8, 16
@@ -584,6 +602,9 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-HSA-LABEL: constant_load_v16i16_align2:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
@@ -837,6 +858,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_zextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -912,6 +936,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_sextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -988,6 +1015,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1063,6 +1093,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1137,6 +1170,9 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1222,6 +1258,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1311,6 +1350,9 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1408,6 +1450,9 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1510,6 +1555,9 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1616,6 +1664,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1733,6 +1784,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -1891,6 +1945,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2068,6 +2125,9 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2330,6 +2390,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2637,7 +2700,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3118,7 +3184,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3686,7 +3755,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4602,7 +4674,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -5389,6 +5464,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_zextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5486,6 +5564,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_sextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5584,6 +5665,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5676,6 +5760,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5773,12 +5860,15 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s0, s2, 16
; GCN-HSA-NEXT: s_and_b32 s1, s2, 0xffff
@@ -5883,6 +5973,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -5986,10 +6079,13 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16
; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16
@@ -6142,6 +6238,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6298,10 +6397,13 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16
; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16
@@ -6516,6 +6618,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6777,10 +6882,13 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16
; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16
@@ -7162,6 +7270,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -7637,7 +7748,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -8360,7 +8474,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 120f47a277ee6..68a6a148819e8 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -23,6 +23,9 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
; GFX7-HSA-LABEL: constant_load_i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -103,6 +106,9 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v2i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -190,6 +196,9 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v3i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -284,6 +293,9 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v4i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -383,6 +395,9 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v8i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16
@@ -517,6 +532,9 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v9i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s12, s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -678,6 +696,9 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-LABEL: constant_load_v10i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -847,6 +868,9 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-LABEL: constant_load_v11i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -1023,6 +1047,9 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-LABEL: constant_load_v12i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -1202,7 +1229,10 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
;
; GFX7-HSA-LABEL: constant_load_v16i32:
; GFX7-HSA: ; %bb.0: ; %entry
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_add_u32 s18, s16, 48
@@ -1389,6 +1419,9 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX7-HSA-LABEL: constant_zextload_i32_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1473,6 +1506,9 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX7-HSA-LABEL: constant_sextload_i32_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1563,6 +1599,9 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1647,6 +1686,9 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1739,12 +1781,15 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
@@ -1837,6 +1882,9 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1949,13 +1997,16 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
@@ -2082,6 +2133,9 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2244,8 +2298,10 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48
@@ -2253,6 +2309,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11
@@ -2452,6 +2509,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2748,7 +2808,10 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
;
; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3196,7 +3259,10 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
;
; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3628,7 +3694,10 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
;
; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4479,8 +4548,10 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xf0
; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
@@ -4509,6 +4580,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x90
; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31
@@ -5097,7 +5169,10 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
;
; GFX7-HSA-LABEL: constant_load_v32i32:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index b3e75e767ae64..2219ceea7ec9b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
; GFX7-LABEL: constant_load_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
@@ -95,6 +98,9 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-LABEL: constant_load_v2i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-NEXT: v_mov_b32_e32 v4, s0
@@ -179,6 +185,9 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-LABEL: constant_load_v3i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
@@ -294,6 +303,9 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-LABEL: constant_load_v4i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-NEXT: s_add_u32 s10, s8, 16
@@ -421,7 +433,10 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX7-LABEL: constant_load_v8i64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-NEXT: s_add_u32 s18, s16, 48
@@ -638,7 +653,10 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
;
; GFX7-LABEL: constant_load_v16i64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index c608bef3f726e..4031be65fab61 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -27,6 +27,9 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
; GFX7-HSA-LABEL: constant_load_i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -112,6 +115,9 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v2i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -195,6 +201,9 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v3i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -305,6 +314,9 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v4i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -374,6 +386,9 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v8i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -448,6 +463,9 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v16i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -529,6 +547,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_zextload_i8_to_i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -604,6 +625,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_sextload_i8_to_i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -680,6 +704,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -755,6 +782,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -834,6 +864,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -933,6 +966,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1030,6 +1066,9 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1131,6 +1170,9 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1232,6 +1274,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1336,6 +1381,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1453,6 +1501,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -1612,6 +1663,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -1794,6 +1848,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2060,6 +2117,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2374,6 +2434,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2856,6 +2919,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3437,7 +3503,10 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
;
; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4353,7 +4422,10 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
;
; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -5161,6 +5233,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_zextload_i8_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5243,6 +5318,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_sextload_i8_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5328,6 +5406,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5408,6 +5489,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5496,6 +5580,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5603,6 +5690,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5716,10 +5806,13 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_bfe_u32 s4, s2, 0x80008
; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 24
@@ -5854,6 +5947,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6013,10 +6109,13 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 24
; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24
@@ -6235,6 +6334,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6504,10 +6606,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s8, s5, 24
; GFX7-HSA-NEXT: s_lshr_b32 s9, s4, 24
@@ -6898,6 +7003,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -7387,10 +7495,13 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s16, s8, 24
; GFX7-HSA-NEXT: s_lshr_b32 s17, s9, 24
@@ -8128,6 +8239,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -8898,6 +9012,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_zextload_i8_to_i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -8982,6 +9099,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_sextload_i8_to_i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9068,6 +9188,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9152,6 +9275,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9241,6 +9367,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9340,6 +9469,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9452,6 +9584,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -9560,6 +9695,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -9683,6 +9821,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -9832,6 +9973,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -10014,6 +10158,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -10261,6 +10408,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -10574,6 +10724,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -11018,6 +11171,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index c5771bc73b945..9054e509cde8e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -28,6 +28,9 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(
; GCN-HSA-LABEL: global_load_i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -133,6 +136,9 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v2i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -219,6 +225,9 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v3i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -339,6 +348,9 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v4i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -424,6 +436,9 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v8i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -512,6 +527,9 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
; GCN-HSA-LABEL: global_load_v16i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
@@ -662,6 +680,9 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
; GCN-HSA-LABEL: global_load_v16i16_align2:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -811,6 +832,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_zextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -896,6 +920,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_sextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -984,6 +1011,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1069,6 +1099,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1159,6 +1192,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1258,6 +1294,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1359,6 +1398,9 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1469,6 +1511,9 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1586,6 +1631,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1701,6 +1749,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1823,6 +1874,9 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1972,6 +2026,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2136,6 +2193,9 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -2372,6 +2432,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2643,6 +2706,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -3054,6 +3120,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -3573,6 +3642,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -4377,6 +4449,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5142,6 +5217,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_zextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5239,6 +5317,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_sextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5334,6 +5415,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5426,6 +5510,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5524,6 +5611,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5633,6 +5723,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5751,6 +5844,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5896,6 +5992,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6056,10 +6155,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6074,8 +6173,11 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v14, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v16, v4
@@ -6275,6 +6377,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6525,10 +6630,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6545,7 +6650,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v18, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v20, v8
@@ -6905,6 +7013,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -7376,6 +7487,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -8078,6 +8192,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 033a66abcedb9..e8c862a3cb93c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -27,6 +27,9 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(
; GCNX3-HSA-LABEL: global_load_i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -106,6 +109,9 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v2i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -186,6 +192,9 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v3i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -270,6 +279,9 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v4i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -352,6 +364,9 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v8i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
@@ -458,6 +473,9 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v9i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -589,6 +607,9 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v10i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -719,6 +740,9 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v11i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -854,6 +878,9 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v12i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -987,6 +1014,9 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v16i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -1134,6 +1164,9 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr
; GCNX3-HSA-LABEL: global_zextload_i32_to_i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1217,6 +1250,9 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr
; GCNX3-HSA-LABEL: global_sextload_i32_to_i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1301,6 +1337,9 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1384,6 +1423,9 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1471,6 +1513,9 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1569,6 +1614,9 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1674,8 +1722,10 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1683,6 +1733,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2
@@ -1800,6 +1851,9 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1941,8 +1995,10 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1957,6 +2013,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9
; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
@@ -2134,6 +2191,9 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2370,6 +2430,9 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2731,8 +2794,10 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -2766,6 +2831,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17
; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
@@ -3122,6 +3188,9 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -3589,12 +3658,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
;
; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64:
; GCN-GFX900-HSA: ; %bb.0:
-; GCN-GFX900-HSA-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-GFX900-HSA-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-GFX900-HSA-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0
-; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s15
-; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0
+; GCN-GFX900-HSA-NEXT: s_add_u32 s20, s20, s17
+; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0
; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112
@@ -3620,11 +3689,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[16:19], 0 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[20:23], 0 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11
@@ -3667,11 +3736,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[16:19], 0 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[20:23], 0 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51
@@ -3913,6 +3982,9 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -4437,6 +4509,9 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v32i32:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index 4dfc773d615e4..1a6fa3c518ca7 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -13,7 +13,8 @@
; GCN: s_cselect_b32
; GCN-NOT: load_dword
-; GCN: flat_load_dwordx2
+; GCN: flat_load_dword
+; GCN: flat_load_dword
; GCN-NOT: load_dword
; GCN: flat_store_dwordx2
diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
index 245a2775d9f2f..07b5e1610cfc0 100644
--- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
; GCN-LABEL: {{^}}get_global_id_0:
; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
-; GCN: s_mul_i32 [[MUL:s[0-9]+]], s12, [[WGSIZEX]]
+; GCN: s_mul_i32 [[MUL:s[0-9]+]], s14, [[WGSIZEX]]
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0
define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 {
%dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index 401724443567a..bdf1668c35673 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -11,8 +11,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX9-NEXT: s_mul_i32 s12, s12, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s12
+; GFX9-NEXT: s_mul_i32 s14, s14, s4
+; GFX9-NEXT: s_add_i32 s5, s5, s14
; GFX9-NEXT: v_add_u32_e32 v0, s5, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
@@ -39,8 +39,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX10-NEXT: s_mul_i32 s12, s12, s4
-; GFX10-NEXT: v_add3_u32 v0, s5, s12, v0
+; GFX10-NEXT: s_mul_i32 s14, s14, s4
+; GFX10-NEXT: v_add3_u32 v0, s5, s14, v0
; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 88ee2a34dd49f..8d020b9e1a603 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -9,6 +9,8 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
; CHECK-LABEL: memcpy_p0_p0_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v12, s3
; CHECK-NEXT: v_mov_b32_e32 v11, s2
@@ -94,12 +96,12 @@ entry:
define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 {
; CHECK-LABEL: memcpy_p5_p4_minsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
-; CHECK-NEXT: s_add_u32 s16, s16, s15
+; CHECK-NEXT: s_add_u32 s20, s20, s17
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
@@ -107,50 +109,50 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
-; CHECK-NEXT: s_addc_u32 s17, s17, 0
+; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: v_mov_b32_e32 v25, s2
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
-; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
-; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
-; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
+; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(9)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
-; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
-; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
-; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
+; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(13)
-; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
-; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
-; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
-; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
+; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80
; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
-; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
-; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
-; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
+; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
-; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
-; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
-; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
+; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen offset:60
+; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
-; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
-; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
-; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
+; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
-; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -160,55 +162,57 @@ entry:
define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 {
; CHECK-LABEL: memcpy_p0_p5_minsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
-; CHECK-NEXT: s_add_u32 s16, s16, s15
-; CHECK-NEXT: s_addc_u32 s17, s17, 0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; CHECK-NEXT: s_add_u32 s20, s20, s17
+; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v26, s0
-; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
-; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48
+; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
-; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
-; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64
@@ -268,6 +272,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v21, s1
; CHECK-NEXT: v_mov_b32_e32 v20, s0
@@ -294,6 +300,8 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-LABEL: memcpy_p0_p0_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v12, s3
; CHECK-NEXT: v_mov_b32_e32 v11, s2
@@ -379,12 +387,12 @@ entry:
define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 {
; CHECK-LABEL: memcpy_p5_p4_optsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
-; CHECK-NEXT: s_add_u32 s16, s16, s15
+; CHECK-NEXT: s_add_u32 s20, s20, s17
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
@@ -392,50 +400,50 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
-; CHECK-NEXT: s_addc_u32 s17, s17, 0
+; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: v_mov_b32_e32 v25, s2
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
-; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
-; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
-; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
+; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(9)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
-; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
-; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
-; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
+; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(13)
-; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
-; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
-; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
-; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
+; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80
; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
-; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
-; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
-; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
+; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
-; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
-; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
-; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
+; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen offset:60
+; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
-; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
-; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
-; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
+; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
-; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -445,55 +453,57 @@ entry:
define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 {
; CHECK-LABEL: memcpy_p0_p5_optsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
-; CHECK-NEXT: s_add_u32 s16, s16, s15
-; CHECK-NEXT: s_addc_u32 s17, s17, 0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; CHECK-NEXT: s_add_u32 s20, s20, s17
+; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v26, s0
-; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
-; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48
+; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
-; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
-; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64
@@ -553,6 +563,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v21, s1
; CHECK-NEXT: v_mov_b32_e32 v20, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 5af37809443e0..07ad8cb0c4a3d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_agent_unordered_load(
; GFX7-LABEL: flat_agent_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX10-WGP-LABEL: flat_agent_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX10-CU-LABEL: flat_agent_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX7-LABEL: flat_agent_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX10-CU-LABEL: flat_agent_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_load(
; GFX7-LABEL: flat_agent_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX10-WGP-LABEL: flat_agent_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -380,6 +417,10 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX10-CU-LABEL: flat_agent_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -410,6 +451,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -423,6 +466,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -531,6 +576,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX7-LABEL: flat_agent_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -547,6 +595,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -565,6 +617,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -598,6 +654,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -612,6 +670,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -739,6 +799,9 @@ entry:
define amdgpu_kernel void @flat_agent_unordered_store(
; GFX7-LABEL: flat_agent_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -750,6 +813,10 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX10-WGP-LABEL: flat_agent_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -761,6 +828,10 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX10-CU-LABEL: flat_agent_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -783,6 +854,8 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -793,6 +866,8 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -873,6 +948,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX7-LABEL: flat_agent_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -884,6 +962,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -895,6 +977,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX10-CU-LABEL: flat_agent_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -917,6 +1003,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -927,6 +1015,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1007,6 +1097,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_store(
; GFX7-LABEL: flat_agent_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1019,6 +1112,10 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX10-WGP-LABEL: flat_agent_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1032,6 +1129,10 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX10-CU-LABEL: flat_agent_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1057,6 +1158,8 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1068,6 +1171,8 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1165,6 +1270,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX7-LABEL: flat_agent_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1177,6 +1285,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1190,6 +1302,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1215,6 +1331,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1226,6 +1344,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1323,6 +1443,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX7-LABEL: flat_agent_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1334,6 +1457,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1345,6 +1472,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1367,6 +1498,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1377,6 +1510,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1457,6 +1592,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX7-LABEL: flat_agent_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1470,6 +1608,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1485,6 +1627,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1512,6 +1658,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1524,6 +1672,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1622,6 +1772,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX7-LABEL: flat_agent_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1634,6 +1787,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1647,6 +1804,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1672,6 +1833,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1683,6 +1846,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1780,6 +1945,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX7-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1794,6 +1962,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1811,6 +1983,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1841,6 +2017,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1854,6 +2032,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1969,6 +2149,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX7-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1983,6 +2166,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2000,6 +2187,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2030,6 +2221,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2043,6 +2236,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2158,6 +2353,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2174,6 +2372,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2191,6 +2393,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2223,6 +2429,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2237,6 +2445,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2352,6 +2562,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2369,6 +2582,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2388,6 +2605,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2423,6 +2644,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2438,6 +2661,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2574,6 +2799,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2591,6 +2819,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2610,6 +2842,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2645,6 +2881,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2660,6 +2898,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2796,6 +3036,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2821,6 +3064,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2846,6 +3093,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2896,6 +3147,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2910,6 +3163,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3019,6 +3274,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3046,6 +3304,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3075,6 +3337,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3130,6 +3396,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3146,6 +3414,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3273,6 +3543,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3299,6 +3572,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3326,6 +3603,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3379,6 +3660,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3394,6 +3677,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3520,6 +3805,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3548,6 +3836,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3579,6 +3871,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3637,6 +3933,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3654,6 +3952,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3798,6 +4098,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3826,6 +4129,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3857,6 +4164,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3915,6 +4226,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3932,6 +4245,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4076,6 +4391,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4103,6 +4421,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4132,6 +4454,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4187,6 +4513,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4203,6 +4531,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4330,6 +4660,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4357,6 +4690,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4386,6 +4723,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4441,6 +4782,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4457,6 +4800,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4584,6 +4929,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4612,6 +4960,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4643,6 +4995,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4701,6 +5057,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4718,6 +5076,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4862,6 +5222,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4890,6 +5253,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4921,6 +5288,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4979,6 +5350,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4996,6 +5369,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5140,6 +5515,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5168,6 +5546,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5199,6 +5581,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5257,6 +5643,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5274,6 +5662,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5418,6 +5808,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5446,6 +5839,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5477,6 +5874,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5535,6 +5936,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5552,6 +5955,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5696,6 +6101,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5724,6 +6132,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5755,6 +6167,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5813,6 +6229,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5830,6 +6248,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5974,6 +6394,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6002,6 +6425,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6033,6 +6460,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6091,6 +6522,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6108,6 +6541,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6252,6 +6687,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6280,6 +6718,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6311,6 +6753,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6369,6 +6815,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6386,6 +6834,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6530,6 +6980,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6558,6 +7011,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6589,6 +7046,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6647,6 +7108,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6664,6 +7127,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6808,6 +7273,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6837,6 +7305,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6866,6 +7338,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6924,6 +7400,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6941,6 +7419,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7077,6 +7557,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7107,6 +7590,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7138,6 +7625,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7198,6 +7689,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7216,6 +7709,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7361,6 +7856,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7391,6 +7889,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7422,6 +7924,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7483,6 +7989,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7501,6 +8009,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7654,6 +8164,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7685,6 +8198,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7718,6 +8235,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7781,6 +8302,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7800,6 +8323,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7966,6 +8491,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7997,6 +8525,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8030,6 +8562,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8093,6 +8629,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8112,6 +8650,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8278,6 +8818,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8308,6 +8851,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8339,6 +8886,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8399,6 +8950,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8417,6 +8970,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8566,6 +9121,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8596,6 +9154,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8627,6 +9189,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8687,6 +9253,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8705,6 +9273,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8850,6 +9420,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8881,6 +9454,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8914,6 +9491,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8977,6 +9558,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8996,6 +9579,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9162,6 +9747,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9193,6 +9781,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9226,6 +9818,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9289,6 +9885,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9308,6 +9906,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9474,6 +10074,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9505,6 +10108,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9538,6 +10145,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9601,6 +10212,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9620,6 +10233,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9786,6 +10401,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9817,6 +10435,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9850,6 +10472,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9913,6 +10539,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9932,6 +10560,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10098,6 +10728,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10129,6 +10762,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10162,6 +10799,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10225,6 +10866,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10244,6 +10887,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10406,6 +11051,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10437,6 +11085,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10470,6 +11122,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10533,6 +11189,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10552,6 +11210,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10718,6 +11378,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10749,6 +11412,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10782,6 +11449,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10845,6 +11516,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10864,6 +11537,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11030,6 +11705,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11061,6 +11739,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11094,6 +11776,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11157,6 +11843,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11176,6 +11864,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11342,6 +12032,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX7-LABEL: flat_agent_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11356,6 +12049,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11370,6 +12067,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11398,6 +12099,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11410,6 +12113,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11509,6 +12214,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX7-LABEL: flat_agent_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11523,6 +12231,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11537,6 +12249,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11565,6 +12281,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11577,6 +12295,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11676,6 +12396,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX7-LABEL: flat_agent_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11692,6 +12415,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11709,6 +12436,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11741,6 +12472,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11755,6 +12488,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11868,6 +12603,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX7-LABEL: flat_agent_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11885,6 +12623,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11904,6 +12646,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11939,6 +12685,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11954,6 +12702,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12086,6 +12836,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX7-LABEL: flat_agent_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12097,6 +12850,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12108,6 +12865,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12130,6 +12891,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12140,6 +12903,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12220,6 +12985,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX7-LABEL: flat_agent_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12231,6 +12999,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12242,6 +13014,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12264,6 +13040,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12274,6 +13052,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12354,6 +13134,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX7-LABEL: flat_agent_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12366,6 +13149,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12379,6 +13166,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12404,6 +13195,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12415,6 +13208,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12512,6 +13307,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX7-LABEL: flat_agent_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12524,6 +13322,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12537,6 +13339,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12562,6 +13368,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12573,6 +13381,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12670,6 +13480,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12681,6 +13494,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12692,6 +13509,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12714,6 +13535,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12724,6 +13547,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12804,6 +13629,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12817,6 +13645,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12831,6 +13663,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12857,6 +13693,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12869,6 +13707,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12965,6 +13805,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12977,6 +13820,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12990,6 +13837,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13015,6 +13866,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13026,6 +13879,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13123,6 +13978,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13137,6 +13995,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13153,6 +14015,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13182,6 +14048,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13195,6 +14063,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13308,6 +14178,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13322,6 +14195,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13338,6 +14215,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13367,6 +14248,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13380,6 +14263,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13493,6 +14378,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13510,6 +14398,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13528,6 +14420,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13562,6 +14458,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13577,6 +14475,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13697,6 +14597,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13715,6 +14618,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13735,6 +14642,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13772,6 +14683,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13788,6 +14701,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13929,6 +14844,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13947,6 +14865,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13967,6 +14889,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14004,6 +14930,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14020,6 +14948,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14161,6 +15091,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14186,6 +15119,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14211,6 +15148,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14261,6 +15202,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14275,6 +15218,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14384,6 +15329,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14411,6 +15359,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14439,6 +15391,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14493,6 +15449,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14509,6 +15467,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14634,6 +15594,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14660,6 +15623,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14687,6 +15654,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14740,6 +15711,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14755,6 +15728,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14881,6 +15856,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14909,6 +15887,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14939,6 +15921,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14996,6 +15982,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15013,6 +16001,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15155,6 +16145,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15183,6 +16176,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15213,6 +16210,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15270,6 +16271,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15287,6 +16290,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15429,6 +16434,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15456,6 +16464,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15484,6 +16496,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15538,6 +16554,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15554,6 +16572,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15679,6 +16699,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15706,6 +16729,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15734,6 +16761,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15788,6 +16819,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15804,6 +16837,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15929,6 +16964,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15957,6 +16995,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15987,6 +17029,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16044,6 +17090,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16061,6 +17109,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16203,6 +17253,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16231,6 +17284,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16261,6 +17318,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16318,6 +17379,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16335,6 +17398,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16477,6 +17542,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16505,6 +17573,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16535,6 +17607,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16592,6 +17668,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16609,6 +17687,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16751,6 +17831,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16779,6 +17862,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16809,6 +17896,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16866,6 +17957,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16883,6 +17976,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17025,6 +18120,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17053,6 +18151,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17083,6 +18185,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17140,6 +18246,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17157,6 +18265,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17299,6 +18409,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17327,6 +18440,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17357,6 +18474,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17414,6 +18535,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17431,6 +18554,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17573,6 +18698,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17601,6 +18729,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17631,6 +18763,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17688,6 +18824,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17705,6 +18843,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17847,6 +18987,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17875,6 +19018,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17905,6 +19052,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17962,6 +19113,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17979,6 +19132,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18121,6 +19276,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18150,6 +19308,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18179,6 +19341,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18237,6 +19403,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18254,6 +19422,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18390,6 +19560,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18421,6 +19594,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18453,6 +19630,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18515,6 +19696,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18534,6 +19717,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18684,6 +19869,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18714,6 +19902,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18745,6 +19937,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18806,6 +20002,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18824,6 +20022,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18977,6 +20177,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19009,6 +20212,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19043,6 +20250,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19108,6 +20319,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19128,6 +20341,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19299,6 +20514,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19331,6 +20549,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19365,6 +20587,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19430,6 +20656,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19450,6 +20678,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19621,6 +20851,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19652,6 +20885,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19684,6 +20921,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19746,6 +20987,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19765,6 +21008,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19919,6 +21164,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19950,6 +21198,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19982,6 +21234,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20044,6 +21300,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20063,6 +21321,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20213,6 +21473,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20245,6 +21508,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20279,6 +21546,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20344,6 +21615,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20364,6 +21637,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20535,6 +21810,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20567,6 +21845,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20601,6 +21883,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20666,6 +21952,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20686,6 +21974,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20857,6 +22147,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20889,6 +22182,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20923,6 +22220,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20988,6 +22289,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21008,6 +22311,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21179,6 +22484,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21211,6 +22519,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21245,6 +22557,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21310,6 +22626,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21330,6 +22648,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21501,6 +22821,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21533,6 +22856,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21567,6 +22894,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21632,6 +22963,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21652,6 +22985,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21819,6 +23154,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21851,6 +23189,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21885,6 +23227,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21950,6 +23296,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21970,6 +23318,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22141,6 +23491,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22173,6 +23526,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22207,6 +23564,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22272,6 +23633,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22292,6 +23655,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22463,6 +23828,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22495,6 +23863,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22529,6 +23901,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22594,6 +23970,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22614,6 +23992,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 30c0a322d7ddc..3c24c36ec547d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-CU-LABEL: flat_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX7-LABEL: flat_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -211,6 +229,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
@@ -240,6 +262,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-CU-LABEL: flat_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
@@ -298,6 +324,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
@@ -329,6 +357,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
@@ -537,6 +567,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX7-LABEL: flat_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -551,6 +584,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -565,6 +602,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-CU-LABEL: flat_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -593,6 +634,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -605,6 +648,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -704,6 +749,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX7-LABEL: flat_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -732,6 +780,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -759,6 +811,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-CU-LABEL: flat_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -814,6 +870,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -843,6 +901,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1047,6 +1107,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX7-LABEL: flat_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1062,6 +1125,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1077,6 +1144,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1107,6 +1178,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1120,6 +1193,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index b80dfaea01653..b88a10ab24a98 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX7-LABEL: flat_singlethread_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX10-CU-LABEL: flat_singlethread_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX7-LABEL: flat_singlethread_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX7-LABEL: flat_singlethread_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -363,6 +396,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -377,6 +414,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -405,6 +446,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -417,6 +460,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -516,6 +561,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX7-LABEL: flat_singlethread_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -530,6 +578,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -544,6 +596,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -572,6 +628,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -584,6 +642,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -683,6 +743,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX7-LABEL: flat_singlethread_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -694,6 +757,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -705,6 +772,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX10-CU-LABEL: flat_singlethread_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -727,6 +798,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -737,6 +810,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -817,6 +892,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX7-LABEL: flat_singlethread_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -828,6 +906,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -839,6 +921,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -861,6 +947,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -871,6 +959,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -951,6 +1041,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_store(
; GFX7-LABEL: flat_singlethread_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -962,6 +1055,10 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX10-WGP-LABEL: flat_singlethread_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -973,6 +1070,10 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX10-CU-LABEL: flat_singlethread_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -995,6 +1096,8 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1005,6 +1108,8 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1085,6 +1190,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX7-LABEL: flat_singlethread_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1096,6 +1204,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1107,6 +1219,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1129,6 +1245,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1139,6 +1257,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1219,6 +1339,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1230,6 +1353,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1241,6 +1368,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1263,6 +1394,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1273,6 +1406,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1353,6 +1488,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1364,6 +1502,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1375,6 +1517,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1397,6 +1543,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1407,6 +1555,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1487,6 +1637,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX7-LABEL: flat_singlethread_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1498,6 +1651,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1509,6 +1666,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1531,6 +1692,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1541,6 +1704,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1621,6 +1786,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1632,6 +1800,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1643,6 +1815,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1665,6 +1841,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1675,6 +1853,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1755,6 +1935,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1766,6 +1949,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1777,6 +1964,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1799,6 +1990,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1809,6 +2002,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1889,6 +2084,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1904,6 +2102,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1919,6 +2121,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1949,6 +2155,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1962,6 +2170,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2068,6 +2278,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2083,6 +2296,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2098,6 +2315,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2128,6 +2349,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2141,6 +2364,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2247,6 +2472,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2262,6 +2490,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2277,6 +2509,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2307,6 +2543,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2320,6 +2558,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2426,6 +2666,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2451,6 +2694,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2476,6 +2723,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2526,6 +2777,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2540,6 +2793,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2649,6 +2904,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2674,6 +2932,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2699,6 +2961,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2749,6 +3015,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2763,6 +3031,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2872,6 +3142,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2897,6 +3170,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2922,6 +3199,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2972,6 +3253,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2986,6 +3269,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3095,6 +3380,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3120,6 +3408,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3145,6 +3437,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3195,6 +3491,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3209,6 +3507,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3318,6 +3618,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3343,6 +3646,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3368,6 +3675,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3418,6 +3729,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3432,6 +3745,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3541,6 +3856,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3566,6 +3884,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3591,6 +3913,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3641,6 +3967,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3655,6 +3983,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3764,6 +4094,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3789,6 +4122,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3814,6 +4151,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3864,6 +4205,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3878,6 +4221,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3987,6 +4332,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4012,6 +4360,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4037,6 +4389,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4087,6 +4443,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4101,6 +4459,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4210,6 +4570,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4235,6 +4598,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4260,6 +4627,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4310,6 +4681,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4324,6 +4697,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4433,6 +4808,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4458,6 +4836,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4483,6 +4865,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4533,6 +4919,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4547,6 +4935,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4656,6 +5046,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4681,6 +5074,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4706,6 +5103,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4756,6 +5157,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4770,6 +5173,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4879,6 +5284,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4904,6 +5312,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4929,6 +5341,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4979,6 +5395,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4993,6 +5411,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5102,6 +5522,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5127,6 +5550,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5152,6 +5579,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5202,6 +5633,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5216,6 +5649,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5325,6 +5760,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5350,6 +5788,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5375,6 +5817,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5425,6 +5871,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5439,6 +5887,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5548,6 +5998,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5573,6 +6026,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5598,6 +6055,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5648,6 +6109,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5662,6 +6125,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5771,6 +6236,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -5800,6 +6268,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5829,6 +6301,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5887,6 +6363,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5904,6 +6382,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6040,6 +6520,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6069,6 +6552,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6098,6 +6585,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6156,6 +6647,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6173,6 +6666,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6309,6 +6804,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6338,6 +6836,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6367,6 +6869,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6425,6 +6931,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6442,6 +6950,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6578,6 +7088,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6607,6 +7120,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6636,6 +7153,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6694,6 +7215,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6711,6 +7234,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6847,6 +7372,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6876,6 +7404,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6905,6 +7437,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6963,6 +7499,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6980,6 +7518,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7116,6 +7656,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7145,6 +7688,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7174,6 +7721,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7232,6 +7783,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7249,6 +7802,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7385,6 +7940,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7414,6 +7972,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7443,6 +8005,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7501,6 +8067,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7518,6 +8086,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7654,6 +8224,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7683,6 +8256,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7712,6 +8289,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7770,6 +8351,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7787,6 +8370,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7923,6 +8508,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7952,6 +8540,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7981,6 +8573,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8039,6 +8635,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8056,6 +8654,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8192,6 +8792,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8221,6 +8824,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8250,6 +8857,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8308,6 +8919,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8325,6 +8938,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8461,6 +9076,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8490,6 +9108,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8519,6 +9141,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8577,6 +9203,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8594,6 +9222,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8730,6 +9360,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8759,6 +9392,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8788,6 +9425,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8846,6 +9487,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8863,6 +9506,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8999,6 +9644,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9028,6 +9676,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9057,6 +9709,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9115,6 +9771,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9132,6 +9790,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9268,6 +9928,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9297,6 +9960,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9326,6 +9993,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9384,6 +10055,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9401,6 +10074,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9537,6 +10212,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9566,6 +10244,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9595,6 +10277,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9653,6 +10339,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9670,6 +10358,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9806,6 +10496,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9820,6 +10513,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -9834,6 +10531,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -9862,6 +10563,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9874,6 +10577,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9973,6 +10678,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9987,6 +10695,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10001,6 +10713,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10029,6 +10745,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10041,6 +10759,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10140,6 +10860,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10154,6 +10877,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10168,6 +10895,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10196,6 +10927,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10208,6 +10941,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10307,6 +11042,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10321,6 +11059,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10335,6 +11077,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10363,6 +11109,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10375,6 +11123,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10474,6 +11224,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10485,6 +11238,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10496,6 +11253,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10518,6 +11279,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10528,6 +11291,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10608,6 +11373,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10619,6 +11387,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10630,6 +11402,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10652,6 +11428,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10662,6 +11440,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10742,6 +11522,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX7-LABEL: flat_singlethread_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10753,6 +11536,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10764,6 +11551,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10786,6 +11577,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10796,6 +11589,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10876,6 +11671,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10887,6 +11685,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10898,6 +11700,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10920,6 +11726,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10930,6 +11738,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11010,6 +11820,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11021,6 +11834,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11032,6 +11849,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11054,6 +11875,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11064,6 +11887,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11144,6 +11969,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11155,6 +11983,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11166,6 +11998,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11188,6 +12024,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11198,6 +12036,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11278,6 +12118,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11289,6 +12132,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11300,6 +12147,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11322,6 +12173,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11332,6 +12185,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11412,6 +12267,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11423,6 +12281,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11434,6 +12296,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11456,6 +12322,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11466,6 +12334,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11546,6 +12416,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11557,6 +12430,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11568,6 +12445,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11590,6 +12471,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11600,6 +12483,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11680,6 +12565,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11695,6 +12583,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11710,6 +12602,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11740,6 +12636,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11753,6 +12651,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11859,6 +12759,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11874,6 +12777,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11889,6 +12796,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11919,6 +12830,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11932,6 +12845,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12038,6 +12953,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12053,6 +12971,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12068,6 +12990,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12098,6 +13024,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12111,6 +13039,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12217,6 +13147,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12242,6 +13175,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12267,6 +13204,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12317,6 +13258,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12331,6 +13274,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12440,6 +13385,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12465,6 +13413,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12490,6 +13442,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12540,6 +13496,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12554,6 +13512,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12663,6 +13623,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12688,6 +13651,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12713,6 +13680,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12763,6 +13734,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12777,6 +13750,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12886,6 +13861,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12911,6 +13889,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12936,6 +13918,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12986,6 +13972,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13000,6 +13988,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13109,6 +14099,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13134,6 +14127,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13159,6 +14156,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13209,6 +14210,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13223,6 +14226,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13332,6 +14337,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13357,6 +14365,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13382,6 +14394,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13432,6 +14448,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13446,6 +14464,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13555,6 +14575,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13580,6 +14603,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13605,6 +14632,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13655,6 +14686,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13669,6 +14702,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13778,6 +14813,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13803,6 +14841,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13828,6 +14870,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13878,6 +14924,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13892,6 +14940,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14001,6 +15051,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14026,6 +15079,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14051,6 +15108,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14101,6 +15162,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14115,6 +15178,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14224,6 +15289,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14249,6 +15317,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14274,6 +15346,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14324,6 +15400,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14338,6 +15416,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14447,6 +15527,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14472,6 +15555,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14497,6 +15584,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14547,6 +15638,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14561,6 +15654,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14670,6 +15765,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14695,6 +15793,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14720,6 +15822,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14770,6 +15876,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14784,6 +15892,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14893,6 +16003,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14918,6 +16031,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14943,6 +16060,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14993,6 +16114,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15007,6 +16130,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15116,6 +16241,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15141,6 +16269,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15166,6 +16298,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15216,6 +16352,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15230,6 +16368,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15339,6 +16479,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15364,6 +16507,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15389,6 +16536,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15439,6 +16590,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15453,6 +16606,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15562,6 +16717,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15591,6 +16749,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15620,6 +16782,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15678,6 +16844,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15695,6 +16863,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15831,6 +17001,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15860,6 +17033,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15889,6 +17066,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15947,6 +17128,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15964,6 +17147,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16100,6 +17285,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16129,6 +17317,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16158,6 +17350,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16216,6 +17412,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16233,6 +17431,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16369,6 +17569,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16398,6 +17601,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16427,6 +17634,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16485,6 +17696,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16502,6 +17715,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16638,6 +17853,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16667,6 +17885,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16696,6 +17918,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16754,6 +17980,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16771,6 +17999,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16907,6 +18137,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16936,6 +18169,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16965,6 +18202,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17023,6 +18264,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17040,6 +18283,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17176,6 +18421,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17205,6 +18453,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17234,6 +18486,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17292,6 +18548,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17309,6 +18567,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17445,6 +18705,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17474,6 +18737,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17503,6 +18770,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17561,6 +18832,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17578,6 +18851,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17714,6 +18989,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17743,6 +19021,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17772,6 +19054,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17830,6 +19116,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17847,6 +19135,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17983,6 +19273,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18012,6 +19305,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18041,6 +19338,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18099,6 +19400,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18116,6 +19419,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18252,6 +19557,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18281,6 +19589,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18310,6 +19622,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18368,6 +19684,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18385,6 +19703,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18521,6 +19841,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18550,6 +19873,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18579,6 +19906,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18637,6 +19968,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18654,6 +19987,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18790,6 +20125,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18819,6 +20157,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18848,6 +20190,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18906,6 +20252,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18923,6 +20271,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19059,6 +20409,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19088,6 +20441,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19117,6 +20474,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19175,6 +20536,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19192,6 +20555,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19328,6 +20693,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19357,6 +20725,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19386,6 +20758,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19444,6 +20820,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19461,6 +20839,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index 1ec942ea5f47b..919fc3e8f4e4f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_system_unordered_load(
; GFX7-LABEL: flat_system_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX10-WGP-LABEL: flat_system_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX10-CU-LABEL: flat_system_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_load(
; GFX7-LABEL: flat_system_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX10-WGP-LABEL: flat_system_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX10-CU-LABEL: flat_system_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_load(
; GFX7-LABEL: flat_system_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX10-WGP-LABEL: flat_system_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -380,6 +417,10 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX10-CU-LABEL: flat_system_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -410,6 +451,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -424,6 +467,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -533,6 +578,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX7-LABEL: flat_system_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -549,6 +597,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -567,6 +619,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX10-CU-LABEL: flat_system_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -600,6 +656,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -615,6 +673,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -743,6 +803,9 @@ entry:
define amdgpu_kernel void @flat_system_unordered_store(
; GFX7-LABEL: flat_system_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -754,6 +817,10 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX10-WGP-LABEL: flat_system_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -765,6 +832,10 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX10-CU-LABEL: flat_system_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -787,6 +858,8 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -797,6 +870,8 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -877,6 +952,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_store(
; GFX7-LABEL: flat_system_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -888,6 +966,10 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX10-WGP-LABEL: flat_system_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -899,6 +981,10 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX10-CU-LABEL: flat_system_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -921,6 +1007,8 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -931,6 +1019,8 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1011,6 +1101,9 @@ entry:
define amdgpu_kernel void @flat_system_release_store(
; GFX7-LABEL: flat_system_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1023,6 +1116,10 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX10-WGP-LABEL: flat_system_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1036,6 +1133,10 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX10-CU-LABEL: flat_system_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1061,6 +1162,8 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1073,6 +1176,8 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1173,6 +1278,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX7-LABEL: flat_system_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1185,6 +1293,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1198,6 +1310,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX10-CU-LABEL: flat_system_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1223,6 +1339,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1235,6 +1353,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1335,6 +1455,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX7-LABEL: flat_system_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1346,6 +1469,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1357,6 +1484,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1379,6 +1510,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1389,6 +1522,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1469,6 +1604,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX7-LABEL: flat_system_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1482,6 +1620,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1497,6 +1639,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1524,6 +1670,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1537,6 +1685,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1636,6 +1786,9 @@ entry:
define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX7-LABEL: flat_system_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1648,6 +1801,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1661,6 +1818,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1686,6 +1847,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1698,6 +1861,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1798,6 +1963,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX7-LABEL: flat_system_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1812,6 +1980,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1829,6 +2001,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1859,6 +2035,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1874,6 +2052,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1993,6 +2173,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX7-LABEL: flat_system_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2007,6 +2190,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2024,6 +2211,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2054,6 +2245,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2069,6 +2262,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2188,6 +2383,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2204,6 +2402,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2221,6 +2423,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2253,6 +2459,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2268,6 +2476,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2384,6 +2594,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2401,6 +2614,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2420,6 +2637,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2455,6 +2676,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2472,6 +2695,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2612,6 +2837,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2629,6 +2857,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2648,6 +2880,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2683,6 +2919,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2700,6 +2938,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2840,6 +3080,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2865,6 +3108,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2890,6 +3137,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2940,6 +3191,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2954,6 +3207,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3063,6 +3318,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3090,6 +3348,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3119,6 +3381,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3174,6 +3440,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3191,6 +3459,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3319,6 +3589,9 @@ entry:
define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3345,6 +3618,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3372,6 +3649,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3425,6 +3706,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3441,6 +3724,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3570,6 +3855,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3598,6 +3886,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3629,6 +3921,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3687,6 +3983,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3706,6 +4004,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3854,6 +4154,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3882,6 +4185,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3913,6 +4220,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3971,6 +4282,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3990,6 +4303,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4138,6 +4453,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4165,6 +4483,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4194,6 +4516,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4249,6 +4575,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4266,6 +4594,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4394,6 +4724,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4421,6 +4754,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4450,6 +4787,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4505,6 +4846,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4522,6 +4865,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4650,6 +4995,9 @@ entry:
define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX7-LABEL: flat_system_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4678,6 +5026,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4709,6 +5061,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4767,6 +5123,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4786,6 +5144,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4934,6 +5294,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4962,6 +5325,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4993,6 +5360,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5051,6 +5422,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5070,6 +5443,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5218,6 +5593,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5246,6 +5624,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5277,6 +5659,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5335,6 +5721,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5354,6 +5742,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5502,6 +5892,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5530,6 +5923,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5561,6 +5958,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5619,6 +6020,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5638,6 +6041,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5786,6 +6191,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5814,6 +6222,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5845,6 +6257,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5903,6 +6319,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5922,6 +6340,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6070,6 +6490,9 @@ entry:
define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6098,6 +6521,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6129,6 +6556,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6187,6 +6618,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6206,6 +6639,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6354,6 +6789,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6382,6 +6820,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6413,6 +6855,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6471,6 +6917,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6490,6 +6938,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6638,6 +7088,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6666,6 +7119,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6697,6 +7154,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6755,6 +7216,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6774,6 +7237,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6922,6 +7387,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6951,6 +7419,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6980,6 +7452,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7038,6 +7514,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7055,6 +7533,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7191,6 +7671,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7221,6 +7704,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7252,6 +7739,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7312,6 +7803,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7331,6 +7824,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7477,6 +7972,9 @@ entry:
define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7507,6 +8005,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7538,6 +8040,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7599,6 +8105,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7618,6 +8126,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7774,6 +8284,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7805,6 +8318,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7838,6 +8355,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7901,6 +8422,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7922,6 +8445,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8092,6 +8617,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8123,6 +8651,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8156,6 +8688,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8219,6 +8755,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8240,6 +8778,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8410,6 +8950,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8440,6 +8983,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8471,6 +9018,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8531,6 +9082,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8550,6 +9103,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8700,6 +9255,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8730,6 +9288,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8761,6 +9323,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8821,6 +9387,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8840,6 +9408,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8986,6 +9556,9 @@ entry:
define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9017,6 +9590,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9050,6 +9627,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9113,6 +9694,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9134,6 +9717,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9304,6 +9889,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9335,6 +9923,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9368,6 +9960,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9431,6 +10027,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9452,6 +10050,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9622,6 +10222,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9653,6 +10256,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9686,6 +10293,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9749,6 +10360,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9770,6 +10383,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9940,6 +10555,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9971,6 +10589,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10004,6 +10626,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10067,6 +10693,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10088,6 +10716,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10258,6 +10888,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10289,6 +10922,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10322,6 +10959,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10385,6 +11026,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10406,6 +11049,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10572,6 +11217,9 @@ entry:
define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10603,6 +11251,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10636,6 +11288,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10699,6 +11355,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10720,6 +11378,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10890,6 +11550,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10921,6 +11584,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10954,6 +11621,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11017,6 +11688,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11038,6 +11711,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11208,6 +11883,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11239,6 +11917,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11272,6 +11954,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11335,6 +12021,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11356,6 +12044,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11526,6 +12216,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX7-LABEL: flat_system_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11540,6 +12233,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11554,6 +12251,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_system_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11582,6 +12283,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11594,6 +12297,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11693,6 +12398,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX7-LABEL: flat_system_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11707,6 +12415,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11721,6 +12433,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11749,6 +12465,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11761,6 +12479,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11860,6 +12580,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX7-LABEL: flat_system_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11876,6 +12599,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11893,6 +12620,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11925,6 +12656,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11940,6 +12673,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12054,6 +12789,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX7-LABEL: flat_system_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12071,6 +12809,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12090,6 +12832,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12125,6 +12871,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12141,6 +12889,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12274,6 +13024,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX7-LABEL: flat_system_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12285,6 +13038,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12296,6 +13053,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_system_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12318,6 +13079,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12328,6 +13091,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12408,6 +13173,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX7-LABEL: flat_system_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12419,6 +13187,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12430,6 +13202,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12452,6 +13228,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12462,6 +13240,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12542,6 +13322,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX7-LABEL: flat_system_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12554,6 +13337,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12567,6 +13354,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX10-CU-LABEL: flat_system_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12592,6 +13383,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12604,6 +13397,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12704,6 +13499,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX7-LABEL: flat_system_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12716,6 +13514,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12729,6 +13531,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12754,6 +13560,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12766,6 +13574,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12866,6 +13676,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12877,6 +13690,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12888,6 +13705,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12910,6 +13731,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12920,6 +13743,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13000,6 +13825,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13013,6 +13841,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13027,6 +13859,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13053,6 +13889,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13066,6 +13904,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13163,6 +14003,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX7-LABEL: flat_system_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13175,6 +14018,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13188,6 +14035,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13213,6 +14064,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13225,6 +14078,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13325,6 +14180,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13339,6 +14197,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13355,6 +14217,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13384,6 +14250,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13399,6 +14267,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13516,6 +14386,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13530,6 +14403,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13546,6 +14423,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13575,6 +14456,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13590,6 +14473,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13707,6 +14592,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13724,6 +14612,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13742,6 +14634,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13776,6 +14672,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13792,6 +14690,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13913,6 +14813,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13931,6 +14834,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13951,6 +14858,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13988,6 +14899,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14006,6 +14919,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14151,6 +15066,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14169,6 +15087,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14189,6 +15111,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14226,6 +15152,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14244,6 +15172,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14389,6 +15319,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14414,6 +15347,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14439,6 +15376,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14489,6 +15430,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14503,6 +15446,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14612,6 +15557,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14639,6 +15587,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14667,6 +15619,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14721,6 +15677,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14738,6 +15696,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14864,6 +15824,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14890,6 +15853,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14917,6 +15884,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14970,6 +15941,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14986,6 +15959,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15115,6 +16090,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15143,6 +16121,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15173,6 +16155,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15230,6 +16216,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15249,6 +16237,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15395,6 +16385,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15423,6 +16416,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15453,6 +16450,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15510,6 +16511,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15529,6 +16532,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15675,6 +16680,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15702,6 +16710,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15730,6 +16742,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15784,6 +16800,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15801,6 +16819,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15927,6 +16947,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15954,6 +16977,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15982,6 +17009,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16036,6 +17067,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16053,6 +17086,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16179,6 +17214,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16207,6 +17245,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16237,6 +17279,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16294,6 +17340,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16313,6 +17361,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16459,6 +17509,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16487,6 +17540,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16517,6 +17574,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16574,6 +17635,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16593,6 +17656,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16739,6 +17804,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16767,6 +17835,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16797,6 +17869,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16854,6 +17930,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16873,6 +17951,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17019,6 +18099,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17047,6 +18130,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17077,6 +18164,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17134,6 +18225,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17153,6 +18246,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17299,6 +18394,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17327,6 +18425,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17357,6 +18459,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17414,6 +18520,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17433,6 +18541,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17579,6 +18689,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17607,6 +18720,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17637,6 +18754,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17694,6 +18815,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17713,6 +18836,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17859,6 +18984,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17887,6 +19015,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17917,6 +19049,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17974,6 +19110,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17993,6 +19131,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18139,6 +19279,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18167,6 +19310,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18197,6 +19344,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18254,6 +19405,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18273,6 +19426,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18419,6 +19574,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18448,6 +19606,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18477,6 +19639,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18535,6 +19701,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18552,6 +19720,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18688,6 +19858,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18719,6 +19892,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18751,6 +19928,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18813,6 +19994,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18833,6 +20016,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18984,6 +20169,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19014,6 +20202,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19045,6 +20237,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19106,6 +20302,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19125,6 +20323,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19281,6 +20481,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19313,6 +20516,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19347,6 +20554,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19412,6 +20623,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19434,6 +20647,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19609,6 +20824,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19641,6 +20859,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19675,6 +20897,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19740,6 +20966,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19762,6 +20990,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19937,6 +21167,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19968,6 +21201,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20000,6 +21237,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20062,6 +21303,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20082,6 +21325,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20237,6 +21482,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20268,6 +21516,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20300,6 +21552,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20362,6 +21618,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20382,6 +21640,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20533,6 +21793,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20565,6 +21828,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20599,6 +21866,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20664,6 +21935,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20686,6 +21959,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20861,6 +22136,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20893,6 +22171,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20927,6 +22209,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20992,6 +22278,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21014,6 +22302,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21189,6 +22479,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21221,6 +22514,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21255,6 +22552,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21320,6 +22621,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21342,6 +22645,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21517,6 +22822,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21549,6 +22857,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21583,6 +22895,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21648,6 +22964,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21670,6 +22988,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21845,6 +23165,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21877,6 +23200,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21911,6 +23238,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21976,6 +23307,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21998,6 +23331,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22169,6 +23504,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22201,6 +23539,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22235,6 +23577,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22300,6 +23646,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22322,6 +23670,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22497,6 +23847,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22529,6 +23882,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22563,6 +23920,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22628,6 +23989,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22650,6 +24013,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22825,6 +24190,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22857,6 +24225,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22891,6 +24263,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22956,6 +24332,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22978,6 +24356,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index e1f82a70b4c0a..a88e0e217fdb4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -11,6 +11,9 @@
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -26,6 +29,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -41,6 +48,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-CU-LABEL: flat_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -142,6 +153,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX7-LABEL: flat_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -172,6 +186,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
@@ -202,6 +220,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-CU-LABEL: flat_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
@@ -405,6 +427,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX7-LABEL: flat_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -420,6 +445,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -435,6 +464,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-CU-LABEL: flat_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -540,6 +573,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX7-LABEL: flat_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -569,6 +605,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -597,6 +637,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-CU-LABEL: flat_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -799,6 +843,9 @@ entry:
define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX7-LABEL: flat_volatile_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -814,6 +861,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
;
; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -829,6 +880,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
;
; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -926,6 +981,9 @@ entry:
define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX7-LABEL: flat_volatile_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -938,6 +996,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
;
; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -951,6 +1013,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
;
; GFX10-CU-LABEL: flat_volatile_workgroup_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index 588f06f1be054..7c637a20ab47b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX7-LABEL: flat_wavefront_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX10-WGP-LABEL: flat_wavefront_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX10-CU-LABEL: flat_wavefront_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX7-LABEL: flat_wavefront_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX7-LABEL: flat_wavefront_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -363,6 +396,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -377,6 +414,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -405,6 +446,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -417,6 +460,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -516,6 +561,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX7-LABEL: flat_wavefront_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -530,6 +578,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -544,6 +596,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -572,6 +628,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -584,6 +642,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -683,6 +743,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX7-LABEL: flat_wavefront_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -694,6 +757,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX10-WGP-LABEL: flat_wavefront_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -705,6 +772,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX10-CU-LABEL: flat_wavefront_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -727,6 +798,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -737,6 +810,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -817,6 +892,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX7-LABEL: flat_wavefront_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -828,6 +906,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -839,6 +921,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -861,6 +947,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -871,6 +959,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -951,6 +1041,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_store(
; GFX7-LABEL: flat_wavefront_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -962,6 +1055,10 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX10-WGP-LABEL: flat_wavefront_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -973,6 +1070,10 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX10-CU-LABEL: flat_wavefront_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -995,6 +1096,8 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1005,6 +1108,8 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1085,6 +1190,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX7-LABEL: flat_wavefront_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1096,6 +1204,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1107,6 +1219,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1129,6 +1245,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1139,6 +1257,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1219,6 +1339,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1230,6 +1353,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1241,6 +1368,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1263,6 +1394,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1273,6 +1406,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1353,6 +1488,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX7-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1364,6 +1502,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1375,6 +1517,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1397,6 +1543,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1407,6 +1555,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1487,6 +1637,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX7-LABEL: flat_wavefront_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1498,6 +1651,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1509,6 +1666,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1531,6 +1692,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1541,6 +1704,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1621,6 +1786,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1632,6 +1800,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1643,6 +1815,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1665,6 +1841,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1675,6 +1853,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1755,6 +1935,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1766,6 +1949,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1777,6 +1964,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1799,6 +1990,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1809,6 +2002,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1889,6 +2084,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1904,6 +2102,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1919,6 +2121,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1949,6 +2155,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1962,6 +2170,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2068,6 +2278,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2083,6 +2296,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2098,6 +2315,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2128,6 +2349,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2141,6 +2364,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2247,6 +2472,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2262,6 +2490,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2277,6 +2509,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2307,6 +2543,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2320,6 +2558,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2426,6 +2666,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2451,6 +2694,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2476,6 +2723,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2526,6 +2777,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2540,6 +2793,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2649,6 +2904,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2674,6 +2932,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2699,6 +2961,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2749,6 +3015,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2763,6 +3031,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2872,6 +3142,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2897,6 +3170,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2922,6 +3199,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2972,6 +3253,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2986,6 +3269,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3095,6 +3380,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3120,6 +3408,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3145,6 +3437,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3195,6 +3491,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3209,6 +3507,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3318,6 +3618,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3343,6 +3646,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3368,6 +3675,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3418,6 +3729,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3432,6 +3745,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3541,6 +3856,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3566,6 +3884,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3591,6 +3913,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3641,6 +3967,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3655,6 +3983,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3764,6 +4094,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3789,6 +4122,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3814,6 +4151,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3864,6 +4205,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3878,6 +4221,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3987,6 +4332,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4012,6 +4360,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4037,6 +4389,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4087,6 +4443,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4101,6 +4459,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4210,6 +4570,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4235,6 +4598,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4260,6 +4627,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4310,6 +4681,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4324,6 +4697,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4433,6 +4808,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4458,6 +4836,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4483,6 +4865,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4533,6 +4919,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4547,6 +4935,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4656,6 +5046,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4681,6 +5074,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4706,6 +5103,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4756,6 +5157,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4770,6 +5173,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4879,6 +5284,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4904,6 +5312,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4929,6 +5341,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4979,6 +5395,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4993,6 +5411,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5102,6 +5522,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5127,6 +5550,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5152,6 +5579,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5202,6 +5633,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5216,6 +5649,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5325,6 +5760,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5350,6 +5788,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5375,6 +5817,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5425,6 +5871,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5439,6 +5887,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5548,6 +5998,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5573,6 +6026,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5598,6 +6055,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5648,6 +6109,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5662,6 +6125,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5771,6 +6236,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -5800,6 +6268,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5829,6 +6301,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5887,6 +6363,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5904,6 +6382,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6040,6 +6520,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6069,6 +6552,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6098,6 +6585,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6156,6 +6647,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6173,6 +6666,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6309,6 +6804,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6338,6 +6836,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6367,6 +6869,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6425,6 +6931,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6442,6 +6950,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6578,6 +7088,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6607,6 +7120,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6636,6 +7153,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6694,6 +7215,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6711,6 +7234,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6847,6 +7372,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6876,6 +7404,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6905,6 +7437,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6963,6 +7499,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6980,6 +7518,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7116,6 +7656,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7145,6 +7688,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7174,6 +7721,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7232,6 +7783,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7249,6 +7802,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7385,6 +7940,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7414,6 +7972,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7443,6 +8005,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7501,6 +8067,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7518,6 +8086,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7654,6 +8224,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7683,6 +8256,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7712,6 +8289,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7770,6 +8351,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7787,6 +8370,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7923,6 +8508,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7952,6 +8540,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7981,6 +8573,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8039,6 +8635,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8056,6 +8654,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8192,6 +8792,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8221,6 +8824,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8250,6 +8857,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8308,6 +8919,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8325,6 +8938,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8461,6 +9076,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8490,6 +9108,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8519,6 +9141,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8577,6 +9203,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8594,6 +9222,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8730,6 +9360,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8759,6 +9392,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8788,6 +9425,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8846,6 +9487,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8863,6 +9506,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8999,6 +9644,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9028,6 +9676,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9057,6 +9709,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9115,6 +9771,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9132,6 +9790,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9268,6 +9928,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9297,6 +9960,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9326,6 +9993,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9384,6 +10055,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9401,6 +10074,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9537,6 +10212,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9566,6 +10244,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9595,6 +10277,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9653,6 +10339,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9670,6 +10358,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9806,6 +10496,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
; GFX7-LABEL: flat_wavefront_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9820,6 +10513,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -9834,6 +10531,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -9862,6 +10563,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9874,6 +10577,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9973,6 +10678,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9987,6 +10695,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10001,6 +10713,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10029,6 +10745,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10041,6 +10759,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10140,6 +10860,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
; GFX7-LABEL: flat_wavefront_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10154,6 +10877,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10168,6 +10895,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10196,6 +10927,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10208,6 +10941,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10307,6 +11042,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10321,6 +11059,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10335,6 +11077,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10363,6 +11109,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10375,6 +11123,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10474,6 +11224,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
; GFX7-LABEL: flat_wavefront_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10485,6 +11238,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10496,6 +11253,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10518,6 +11279,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10528,6 +11291,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10608,6 +11373,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10619,6 +11387,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10630,6 +11402,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10652,6 +11428,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10662,6 +11440,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10742,6 +11522,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_store(
; GFX7-LABEL: flat_wavefront_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10753,6 +11536,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10764,6 +11551,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10786,6 +11577,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10796,6 +11589,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10876,6 +11671,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10887,6 +11685,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10898,6 +11700,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10920,6 +11726,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10930,6 +11738,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11010,6 +11820,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11021,6 +11834,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11032,6 +11849,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11054,6 +11875,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11064,6 +11887,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11144,6 +11969,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11155,6 +11983,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11166,6 +11998,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11188,6 +12024,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11198,6 +12036,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11278,6 +12118,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11289,6 +12132,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11300,6 +12147,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11322,6 +12173,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11332,6 +12185,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11412,6 +12267,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11423,6 +12281,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11434,6 +12296,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11456,6 +12322,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11466,6 +12334,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11546,6 +12416,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11557,6 +12430,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11568,6 +12445,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11590,6 +12471,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11600,6 +12483,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11680,6 +12565,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11695,6 +12583,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11710,6 +12602,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11740,6 +12636,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11753,6 +12651,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11859,6 +12759,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11874,6 +12777,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11889,6 +12796,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11919,6 +12830,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11932,6 +12845,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12038,6 +12953,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12053,6 +12971,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12068,6 +12990,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12098,6 +13024,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12111,6 +13039,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12217,6 +13147,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12242,6 +13175,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12267,6 +13204,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12317,6 +13258,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12331,6 +13274,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12440,6 +13385,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12465,6 +13413,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12490,6 +13442,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12540,6 +13496,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12554,6 +13512,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12663,6 +13623,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12688,6 +13651,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12713,6 +13680,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12763,6 +13734,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12777,6 +13750,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12886,6 +13861,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12911,6 +13889,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12936,6 +13918,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12986,6 +13972,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13000,6 +13988,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13109,6 +14099,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13134,6 +14127,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13159,6 +14156,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13209,6 +14210,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13223,6 +14226,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13332,6 +14337,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13357,6 +14365,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13382,6 +14394,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13432,6 +14448,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13446,6 +14464,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13555,6 +14575,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13580,6 +14603,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13605,6 +14632,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13655,6 +14686,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13669,6 +14702,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13778,6 +14813,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13803,6 +14841,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13828,6 +14870,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13878,6 +14924,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13892,6 +14940,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14001,6 +15051,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14026,6 +15079,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14051,6 +15108,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14101,6 +15162,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14115,6 +15178,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14224,6 +15289,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14249,6 +15317,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14274,6 +15346,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14324,6 +15400,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14338,6 +15416,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14447,6 +15527,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14472,6 +15555,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14497,6 +15584,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14547,6 +15638,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14561,6 +15654,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14670,6 +15765,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14695,6 +15793,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14720,6 +15822,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14770,6 +15876,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14784,6 +15892,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14893,6 +16003,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14918,6 +16031,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14943,6 +16060,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14993,6 +16114,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15007,6 +16130,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15116,6 +16241,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15141,6 +16269,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15166,6 +16298,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15216,6 +16352,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15230,6 +16368,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15339,6 +16479,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15364,6 +16507,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15389,6 +16536,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15439,6 +16590,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15453,6 +16606,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15562,6 +16717,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15591,6 +16749,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15620,6 +16782,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15678,6 +16844,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15695,6 +16863,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15831,6 +17001,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15860,6 +17033,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15889,6 +17066,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15947,6 +17128,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15964,6 +17147,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16100,6 +17285,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16129,6 +17317,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16158,6 +17350,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16216,6 +17412,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16233,6 +17431,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16369,6 +17569,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16398,6 +17601,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16427,6 +17634,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16485,6 +17696,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16502,6 +17715,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16638,6 +17853,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16667,6 +17885,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16696,6 +17918,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16754,6 +17980,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16771,6 +17999,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16907,6 +18137,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16936,6 +18169,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16965,6 +18202,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17023,6 +18264,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17040,6 +18283,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17176,6 +18421,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17205,6 +18453,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17234,6 +18486,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17292,6 +18548,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17309,6 +18567,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17445,6 +18705,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17474,6 +18737,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17503,6 +18770,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17561,6 +18832,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17578,6 +18851,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17714,6 +18989,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17743,6 +19021,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17772,6 +19054,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17830,6 +19116,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17847,6 +19135,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17983,6 +19273,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18012,6 +19305,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18041,6 +19338,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18099,6 +19400,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18116,6 +19419,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18252,6 +19557,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18281,6 +19589,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18310,6 +19622,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18368,6 +19684,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18385,6 +19703,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18521,6 +19841,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18550,6 +19873,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18579,6 +19906,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18637,6 +19968,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18654,6 +19987,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18790,6 +20125,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18819,6 +20157,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18848,6 +20190,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18906,6 +20252,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18923,6 +20271,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19059,6 +20409,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19088,6 +20441,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19117,6 +20474,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19175,6 +20536,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19192,6 +20555,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index ee7d79a8a8cbb..0fd4aa4a7a93f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX7-LABEL: flat_workgroup_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX10-WGP-LABEL: flat_workgroup_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX10-CU-LABEL: flat_workgroup_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX7-LABEL: flat_workgroup_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX7-LABEL: flat_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -379,6 +416,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -409,6 +450,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -422,6 +465,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -528,6 +573,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX7-LABEL: flat_workgroup_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -544,6 +592,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -561,6 +613,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -593,6 +649,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -607,6 +665,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -726,6 +786,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX7-LABEL: flat_workgroup_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -737,6 +800,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX10-WGP-LABEL: flat_workgroup_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -748,6 +815,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX10-CU-LABEL: flat_workgroup_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -770,6 +841,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -780,6 +853,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -860,6 +935,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX7-LABEL: flat_workgroup_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -871,6 +949,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -882,6 +964,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -904,6 +990,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -914,6 +1002,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -994,6 +1084,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_store(
; GFX7-LABEL: flat_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1006,6 +1099,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX10-WGP-LABEL: flat_workgroup_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1019,6 +1116,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX10-CU-LABEL: flat_workgroup_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1043,6 +1144,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1054,6 +1157,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1145,6 +1250,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX7-LABEL: flat_workgroup_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1157,6 +1265,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1170,6 +1282,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1194,6 +1310,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1205,6 +1323,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1296,6 +1416,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1307,6 +1430,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1318,6 +1445,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1340,6 +1471,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1350,6 +1483,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1430,6 +1565,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX7-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1442,6 +1580,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1456,6 +1598,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1480,6 +1626,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1491,6 +1639,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1583,6 +1733,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX7-LABEL: flat_workgroup_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1595,6 +1748,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1608,6 +1765,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1632,6 +1793,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1643,6 +1806,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1734,6 +1899,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1747,6 +1915,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1763,6 +1935,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1789,6 +1965,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1801,6 +1979,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1904,6 +2084,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1917,6 +2100,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1933,6 +2120,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1959,6 +2150,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1971,6 +2164,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2074,6 +2269,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2090,6 +2288,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2106,6 +2308,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2138,6 +2344,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2152,6 +2360,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2265,6 +2475,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2282,6 +2495,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2300,6 +2517,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2334,6 +2555,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2349,6 +2572,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2475,6 +2700,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2492,6 +2720,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2510,6 +2742,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2544,6 +2780,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2559,6 +2797,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2685,6 +2925,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2710,6 +2953,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2735,6 +2982,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2785,6 +3036,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2799,6 +3052,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2908,6 +3163,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2934,6 +3192,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2962,6 +3224,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3014,6 +3280,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3029,6 +3297,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3150,6 +3420,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3176,6 +3449,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3203,6 +3480,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3255,6 +3536,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3270,6 +3553,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3390,6 +3675,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3417,6 +3705,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3447,6 +3739,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3501,6 +3797,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3517,6 +3815,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3649,6 +3949,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3676,6 +3979,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3706,6 +4013,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3760,6 +4071,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3776,6 +4089,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3908,6 +4223,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3934,6 +4252,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3962,6 +4284,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4014,6 +4340,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4029,6 +4357,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4150,6 +4480,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4176,6 +4509,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4204,6 +4541,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4256,6 +4597,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4271,6 +4614,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4392,6 +4737,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4419,6 +4767,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4449,6 +4801,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4503,6 +4859,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4519,6 +4877,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4651,6 +5011,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4678,6 +5041,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4708,6 +5075,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4762,6 +5133,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4778,6 +5151,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4910,6 +5285,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4937,6 +5315,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4967,6 +5349,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5021,6 +5407,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5037,6 +5425,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5169,6 +5559,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5196,6 +5589,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5226,6 +5623,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5280,6 +5681,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5296,6 +5699,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5428,6 +5833,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -5457,6 +5865,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5486,6 +5898,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5544,6 +5960,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5561,6 +5979,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5697,6 +6117,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -5727,6 +6150,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5757,6 +6184,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5817,6 +6248,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5835,6 +6268,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5978,6 +6413,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6008,6 +6446,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6039,6 +6481,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6099,6 +6545,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6117,6 +6565,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6264,6 +6714,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6295,6 +6748,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6327,6 +6784,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6389,6 +6850,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6408,6 +6871,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6564,6 +7029,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6595,6 +7063,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6627,6 +7099,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6689,6 +7165,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6708,6 +7186,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6864,6 +7344,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6894,6 +7377,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6924,6 +7411,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6984,6 +7475,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7002,6 +7495,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7147,6 +7642,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7177,6 +7675,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7207,6 +7709,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7267,6 +7773,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7285,6 +7793,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7428,6 +7938,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7459,6 +7972,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7491,6 +8008,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7553,6 +8074,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7572,6 +8095,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7728,6 +8253,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7759,6 +8287,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7791,6 +8323,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7853,6 +8389,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7872,6 +8410,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8028,6 +8568,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8059,6 +8602,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8091,6 +8638,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8153,6 +8704,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8172,6 +8725,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8328,6 +8883,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8359,6 +8917,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8391,6 +8953,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8453,6 +9019,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8472,6 +9040,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8628,6 +9198,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8659,6 +9232,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8691,6 +9268,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8753,6 +9334,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8772,6 +9355,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8926,6 +9511,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8957,6 +9545,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8989,6 +9581,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9051,6 +9647,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9070,6 +9668,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9226,6 +9826,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9257,6 +9860,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9289,6 +9896,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9351,6 +9962,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9370,6 +9983,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9526,6 +10141,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9557,6 +10175,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9589,6 +10211,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9651,6 +10277,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9670,6 +10298,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9826,6 +10456,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9840,6 +10473,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -9854,6 +10491,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -9882,6 +10523,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9894,6 +10537,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9993,6 +10638,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10007,6 +10655,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10021,6 +10673,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10049,6 +10705,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10061,6 +10719,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10160,6 +10820,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10174,6 +10837,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10190,6 +10857,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10218,6 +10889,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10230,6 +10903,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10335,6 +11010,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10349,6 +11027,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10367,6 +11049,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10395,6 +11081,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10407,6 +11095,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10522,6 +11212,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10533,6 +11226,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10544,6 +11241,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10566,6 +11267,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10576,6 +11279,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10656,6 +11361,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10667,6 +11375,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10678,6 +11390,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10700,6 +11416,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10710,6 +11428,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10790,6 +11510,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX7-LABEL: flat_workgroup_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10801,6 +11524,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10814,6 +11541,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10836,6 +11567,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10846,6 +11579,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10934,6 +11669,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10945,6 +11683,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10958,6 +11700,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10980,6 +11726,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10990,6 +11738,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11078,6 +11828,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11089,6 +11842,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11100,6 +11857,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11122,6 +11883,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11132,6 +11895,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11212,6 +11977,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11223,6 +11991,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11236,6 +12008,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11258,6 +12034,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11268,6 +12046,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11356,6 +12136,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11367,6 +12150,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11380,6 +12167,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11402,6 +12193,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11412,6 +12205,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11500,6 +12295,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11511,6 +12309,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11526,6 +12328,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11548,6 +12354,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11558,6 +12366,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11654,6 +12464,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11665,6 +12478,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11680,6 +12497,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11702,6 +12523,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11712,6 +12535,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11808,6 +12633,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11823,6 +12651,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11840,6 +12672,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11870,6 +12706,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11883,6 +12721,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11995,6 +12835,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12010,6 +12853,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12029,6 +12876,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12059,6 +12910,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12072,6 +12925,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12194,6 +13049,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12209,6 +13067,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12228,6 +13090,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12258,6 +13124,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12271,6 +13139,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12393,6 +13263,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12418,6 +13291,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12443,6 +13320,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12493,6 +13374,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12507,6 +13390,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12616,6 +13501,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12641,6 +13529,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12668,6 +13560,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12718,6 +13614,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12732,6 +13630,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12849,6 +13749,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12874,6 +13777,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12901,6 +13808,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12951,6 +13862,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12965,6 +13878,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13082,6 +13997,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13107,6 +14025,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13136,6 +14058,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13186,6 +14112,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13200,6 +14128,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13325,6 +14255,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13350,6 +14283,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13379,6 +14316,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13429,6 +14370,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13443,6 +14386,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13568,6 +14513,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13593,6 +14541,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13620,6 +14572,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13670,6 +14626,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13684,6 +14642,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13801,6 +14761,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13826,6 +14789,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13853,6 +14820,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13903,6 +14874,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13917,6 +14890,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14034,6 +15009,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14059,6 +15037,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14088,6 +15070,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14138,6 +15124,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14152,6 +15140,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14277,6 +15267,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14302,6 +15295,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14331,6 +15328,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14381,6 +15382,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14395,6 +15398,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14520,6 +15525,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14545,6 +15553,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14574,6 +15586,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14624,6 +15640,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14638,6 +15656,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14763,6 +15783,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14788,6 +15811,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14817,6 +15844,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14867,6 +15898,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14881,6 +15914,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15006,6 +16041,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15031,6 +16069,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15060,6 +16102,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15110,6 +16156,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15124,6 +16172,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15249,6 +16299,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15274,6 +16327,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15303,6 +16360,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15353,6 +16414,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15367,6 +16430,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15492,6 +16557,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15517,6 +16585,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15546,6 +16618,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15596,6 +16672,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15610,6 +16688,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15735,6 +16815,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15760,6 +16843,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15789,6 +16876,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15839,6 +16930,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15853,6 +16946,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15978,6 +17073,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16007,6 +17105,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16036,6 +17138,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16094,6 +17200,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16111,6 +17219,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16247,6 +17357,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16276,6 +17389,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16307,6 +17424,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16365,6 +17486,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16382,6 +17505,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16524,6 +17649,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16553,6 +17681,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16584,6 +17716,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16642,6 +17778,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16659,6 +17797,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16803,6 +17943,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16832,6 +17975,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16865,6 +18012,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16923,6 +18074,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16940,6 +18093,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17092,6 +18247,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17121,6 +18279,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17154,6 +18316,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17212,6 +18378,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17229,6 +18397,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17381,6 +18551,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17410,6 +18583,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17441,6 +18618,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17499,6 +18680,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17516,6 +18699,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17660,6 +18845,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17689,6 +18877,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17720,6 +18912,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17778,6 +18974,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17795,6 +18993,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17937,6 +19137,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17966,6 +19169,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17999,6 +19206,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18057,6 +19268,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18074,6 +19287,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18226,6 +19441,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18255,6 +19473,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18288,6 +19510,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18346,6 +19572,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18363,6 +19591,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18515,6 +19745,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18544,6 +19777,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18577,6 +19814,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18635,6 +19876,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18652,6 +19895,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18804,6 +20049,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18833,6 +20081,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18866,6 +20118,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18924,6 +20180,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18941,6 +20199,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19093,6 +20353,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19122,6 +20385,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19155,6 +20422,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19213,6 +20484,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19230,6 +20503,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19380,6 +20655,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19409,6 +20687,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19442,6 +20724,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19500,6 +20786,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19517,6 +20805,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19669,6 +20959,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19698,6 +20991,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19731,6 +21028,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19789,6 +21090,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19806,6 +21109,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19958,6 +21263,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19987,6 +21295,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20020,6 +21332,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20078,6 +21394,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20095,6 +21413,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index b9487f8e14c2b..8b600c835a160 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_agent_unordered_load(
;
; GFX7-LABEL: global_agent_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_agent_monotonic_load(
;
; GFX7-LABEL: global_agent_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -404,6 +410,9 @@ define amdgpu_kernel void @global_agent_acquire_load(
;
; GFX7-LABEL: global_agent_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -602,6 +611,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
;
; GFX7-LABEL: global_agent_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -809,6 +821,9 @@ define amdgpu_kernel void @global_agent_unordered_store(
;
; GFX7-LABEL: global_agent_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -962,6 +977,9 @@ define amdgpu_kernel void @global_agent_monotonic_store(
;
; GFX7-LABEL: global_agent_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1116,6 +1134,9 @@ define amdgpu_kernel void @global_agent_release_store(
;
; GFX7-LABEL: global_agent_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1294,6 +1315,9 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
;
; GFX7-LABEL: global_agent_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1470,6 +1494,9 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
;
; GFX7-LABEL: global_agent_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1623,6 +1650,9 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
;
; GFX7-LABEL: global_agent_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1802,6 +1832,9 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
;
; GFX7-LABEL: global_agent_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1980,6 +2013,9 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_agent_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2185,6 +2221,9 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_agent_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2390,6 +2429,9 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2587,6 +2629,9 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2812,6 +2857,9 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -3038,6 +3086,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3256,6 +3307,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3500,6 +3554,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3743,6 +3800,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4013,6 +4073,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4282,6 +4345,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4527,6 +4593,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4773,6 +4842,9 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5043,6 +5115,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5313,6 +5388,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5583,6 +5661,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5853,6 +5934,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6123,6 +6207,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6393,6 +6480,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6663,6 +6753,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6933,6 +7026,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7182,6 +7278,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7447,6 +7546,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7721,6 +7823,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8015,6 +8120,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8308,6 +8416,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8577,6 +8688,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8843,6 +8957,9 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9137,6 +9254,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9431,6 +9551,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9725,6 +9848,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10019,6 +10145,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10309,6 +10438,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10603,6 +10735,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10897,6 +11032,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11189,6 +11327,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
;
; GFX7-LABEL: global_agent_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11370,6 +11511,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
;
; GFX7-LABEL: global_agent_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11552,6 +11696,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
;
; GFX7-LABEL: global_agent_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11750,6 +11897,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11957,6 +12107,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
;
; GFX7-LABEL: global_agent_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12110,6 +12263,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
;
; GFX7-LABEL: global_agent_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12264,6 +12420,9 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
;
; GFX7-LABEL: global_agent_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12442,6 +12601,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12618,6 +12780,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12771,6 +12936,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12950,6 +13118,9 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13128,6 +13299,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13333,6 +13507,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13538,6 +13715,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13735,6 +13915,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13960,6 +14143,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14186,6 +14372,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14404,6 +14593,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14648,6 +14840,9 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14891,6 +15086,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15161,6 +15359,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15430,6 +15631,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15675,6 +15879,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15921,6 +16128,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16191,6 +16401,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16461,6 +16674,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16731,6 +16947,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17001,6 +17220,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17271,6 +17493,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17541,6 +17766,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17811,6 +18039,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18081,6 +18312,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18330,6 +18564,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18596,6 +18833,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18890,6 +19130,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19183,6 +19426,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19452,6 +19698,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19718,6 +19967,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20012,6 +20264,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20306,6 +20561,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20600,6 +20858,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20894,6 +21155,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21184,6 +21448,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21478,6 +21745,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21772,6 +22042,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index a6bd1b678f95e..16e55058e4fc8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -36,6 +36,9 @@ define amdgpu_kernel void @global_nontemporal_load_0(
;
; GFX7-LABEL: global_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -227,6 +230,9 @@ define amdgpu_kernel void @global_nontemporal_load_1(
;
; GFX7-LABEL: global_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -474,6 +480,9 @@ define amdgpu_kernel void @global_nontemporal_store_0(
;
; GFX7-LABEL: global_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -658,6 +667,9 @@ define amdgpu_kernel void @global_nontemporal_store_1(
;
; GFX7-LABEL: global_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -891,6 +903,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
;
; GFX7-LABEL: global_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index a5de6a92db1af..8042d38716107 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
;
; GFX7-LABEL: global_singlethread_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
;
; GFX7-LABEL: global_singlethread_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -403,6 +409,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
;
; GFX7-LABEL: global_singlethread_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -584,6 +593,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
;
; GFX7-LABEL: global_singlethread_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -758,6 +770,9 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
;
; GFX7-LABEL: global_singlethread_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -911,6 +926,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
;
; GFX7-LABEL: global_singlethread_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1064,6 +1082,9 @@ define amdgpu_kernel void @global_singlethread_release_store(
;
; GFX7-LABEL: global_singlethread_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1217,6 +1238,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
;
; GFX7-LABEL: global_singlethread_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1369,6 +1393,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
;
; GFX7-LABEL: global_singlethread_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1520,6 +1547,9 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1671,6 +1701,9 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
;
; GFX7-LABEL: global_singlethread_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1822,6 +1855,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1973,6 +2009,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2126,6 +2165,9 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2306,6 +2348,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2486,6 +2531,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2669,6 +2717,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2885,6 +2936,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3101,6 +3155,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3317,6 +3374,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3533,6 +3593,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3749,6 +3812,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3965,6 +4031,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4181,6 +4250,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4397,6 +4469,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4613,6 +4688,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4829,6 +4907,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5045,6 +5126,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5261,6 +5345,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5477,6 +5564,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5693,6 +5783,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5912,6 +6005,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6160,6 +6256,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6408,6 +6507,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6656,6 +6758,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6904,6 +7009,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7152,6 +7260,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7400,6 +7511,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7648,6 +7762,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7896,6 +8013,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8144,6 +8264,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8392,6 +8515,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8640,6 +8766,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8888,6 +9017,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9136,6 +9268,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9384,6 +9519,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9632,6 +9770,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
;
; GFX7-LABEL: global_singlethread_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9813,6 +9954,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9994,6 +10138,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10175,6 +10322,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10349,6 +10499,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
;
; GFX7-LABEL: global_singlethread_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10502,6 +10655,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10655,6 +10811,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
;
; GFX7-LABEL: global_singlethread_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10808,6 +10967,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10960,6 +11122,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11111,6 +11276,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11262,6 +11430,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11413,6 +11584,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11564,6 +11738,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11717,6 +11894,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11897,6 +12077,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12077,6 +12260,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12260,6 +12446,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12476,6 +12665,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12692,6 +12884,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12908,6 +13103,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13124,6 +13322,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13340,6 +13541,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13556,6 +13760,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13772,6 +13979,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13988,6 +14198,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14204,6 +14417,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14420,6 +14636,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14636,6 +14855,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14852,6 +15074,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15068,6 +15293,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15284,6 +15512,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15503,6 +15734,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15751,6 +15985,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15999,6 +16236,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16247,6 +16487,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16495,6 +16738,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16743,6 +16989,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16991,6 +17240,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17239,6 +17491,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17487,6 +17742,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17735,6 +17993,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17983,6 +18244,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18231,6 +18495,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18479,6 +18746,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18727,6 +18997,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18975,6 +19248,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 69404247ccd6e..9c11781da56f2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_system_unordered_load(
;
; GFX7-LABEL: global_system_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_system_monotonic_load(
;
; GFX7-LABEL: global_system_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -404,6 +410,9 @@ define amdgpu_kernel void @global_system_acquire_load(
;
; GFX7-LABEL: global_system_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -604,6 +613,9 @@ define amdgpu_kernel void @global_system_seq_cst_load(
;
; GFX7-LABEL: global_system_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -813,6 +825,9 @@ define amdgpu_kernel void @global_system_unordered_store(
;
; GFX7-LABEL: global_system_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -966,6 +981,9 @@ define amdgpu_kernel void @global_system_monotonic_store(
;
; GFX7-LABEL: global_system_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1120,6 +1138,9 @@ define amdgpu_kernel void @global_system_release_store(
;
; GFX7-LABEL: global_system_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1302,6 +1323,9 @@ define amdgpu_kernel void @global_system_seq_cst_store(
;
; GFX7-LABEL: global_system_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1482,6 +1506,9 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
;
; GFX7-LABEL: global_system_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1635,6 +1662,9 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
;
; GFX7-LABEL: global_system_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1816,6 +1846,9 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
;
; GFX7-LABEL: global_system_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1998,6 +2031,9 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_system_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2209,6 +2245,9 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_system_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2420,6 +2459,9 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_system_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2619,6 +2661,9 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2850,6 +2895,9 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -3082,6 +3130,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3300,6 +3351,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3546,6 +3600,9 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3793,6 +3850,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4069,6 +4129,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4344,6 +4407,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4591,6 +4657,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4839,6 +4908,9 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5115,6 +5187,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5391,6 +5466,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5667,6 +5745,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5943,6 +6024,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6192,6 +6276,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6460,6 +6547,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6760,6 +6850,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7059,6 +7152,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7330,6 +7426,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7598,6 +7697,9 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7898,6 +8000,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8198,6 +8303,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8498,6 +8606,9 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8798,6 +8909,9 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9094,6 +9208,9 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9394,6 +9511,9 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9694,6 +9814,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9992,6 +10115,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
;
; GFX7-LABEL: global_system_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10173,6 +10299,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
;
; GFX7-LABEL: global_system_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10355,6 +10484,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
;
; GFX7-LABEL: global_system_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10555,6 +10687,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
;
; GFX7-LABEL: global_system_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10764,6 +10899,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
;
; GFX7-LABEL: global_system_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10917,6 +11055,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
;
; GFX7-LABEL: global_system_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11071,6 +11212,9 @@ define amdgpu_kernel void @global_system_one_as_release_store(
;
; GFX7-LABEL: global_system_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11253,6 +11397,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
;
; GFX7-LABEL: global_system_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11433,6 +11580,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11586,6 +11736,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11767,6 +11920,9 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11949,6 +12105,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12160,6 +12319,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12371,6 +12533,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12570,6 +12735,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12801,6 +12969,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13033,6 +13204,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13251,6 +13425,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13497,6 +13674,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13744,6 +13924,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14020,6 +14203,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14295,6 +14481,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14542,6 +14731,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14790,6 +14982,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15066,6 +15261,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15342,6 +15540,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15618,6 +15819,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15894,6 +16098,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16170,6 +16377,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16446,6 +16656,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16722,6 +16935,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16998,6 +17214,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17247,6 +17466,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17514,6 +17736,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17792,6 +18017,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18092,6 +18320,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18391,6 +18622,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18662,6 +18896,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18930,6 +19167,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19230,6 +19470,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19530,6 +19773,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19830,6 +20076,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20130,6 +20379,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20426,6 +20678,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20726,6 +20981,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21026,6 +21284,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 7dfd5e60c24f8..8a5c5dda9f79c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -37,6 +37,9 @@ define amdgpu_kernel void @global_volatile_load_0(
;
; GFX7-LABEL: global_volatile_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -184,6 +187,9 @@ define amdgpu_kernel void @global_volatile_load_1(
;
; GFX7-LABEL: global_volatile_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -372,6 +378,9 @@ define amdgpu_kernel void @global_volatile_store_0(
;
; GFX7-LABEL: global_volatile_store_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -527,6 +536,9 @@ define amdgpu_kernel void @global_volatile_store_1(
;
; GFX7-LABEL: global_volatile_store_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -718,6 +730,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
;
; GFX7-LABEL: global_volatile_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -852,6 +867,9 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
;
; GFX7-LABEL: global_volatile_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index 4b6c99282dc13..151ba07a0b531 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
;
; GFX7-LABEL: global_wavefront_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
;
; GFX7-LABEL: global_wavefront_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -403,6 +409,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
;
; GFX7-LABEL: global_wavefront_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -584,6 +593,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
;
; GFX7-LABEL: global_wavefront_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -758,6 +770,9 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
;
; GFX7-LABEL: global_wavefront_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -911,6 +926,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
;
; GFX7-LABEL: global_wavefront_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1064,6 +1082,9 @@ define amdgpu_kernel void @global_wavefront_release_store(
;
; GFX7-LABEL: global_wavefront_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1217,6 +1238,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
;
; GFX7-LABEL: global_wavefront_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1369,6 +1393,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
;
; GFX7-LABEL: global_wavefront_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1520,6 +1547,9 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1671,6 +1701,9 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
;
; GFX7-LABEL: global_wavefront_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1822,6 +1855,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1973,6 +2009,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2126,6 +2165,9 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2306,6 +2348,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2486,6 +2531,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2669,6 +2717,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2885,6 +2936,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3101,6 +3155,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3317,6 +3374,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3533,6 +3593,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3749,6 +3812,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3965,6 +4031,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4181,6 +4250,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4397,6 +4469,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4613,6 +4688,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4829,6 +4907,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5045,6 +5126,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5261,6 +5345,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5477,6 +5564,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5693,6 +5783,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5912,6 +6005,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6160,6 +6256,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6408,6 +6507,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6656,6 +6758,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6904,6 +7009,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7152,6 +7260,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7400,6 +7511,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7648,6 +7762,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7896,6 +8013,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8144,6 +8264,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8392,6 +8515,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8640,6 +8766,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8888,6 +9017,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9136,6 +9268,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9384,6 +9519,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9632,6 +9770,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
;
; GFX7-LABEL: global_wavefront_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9813,6 +9954,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9994,6 +10138,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10175,6 +10322,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10349,6 +10499,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
;
; GFX7-LABEL: global_wavefront_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10502,6 +10655,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10655,6 +10811,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
;
; GFX7-LABEL: global_wavefront_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10808,6 +10967,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10960,6 +11122,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11111,6 +11276,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11262,6 +11430,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11413,6 +11584,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11564,6 +11738,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11717,6 +11894,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11897,6 +12077,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12077,6 +12260,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12260,6 +12446,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12476,6 +12665,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12692,6 +12884,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12908,6 +13103,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13124,6 +13322,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13340,6 +13541,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13556,6 +13760,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13772,6 +13979,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13988,6 +14198,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14204,6 +14417,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14420,6 +14636,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14636,6 +14855,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14852,6 +15074,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15068,6 +15293,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15284,6 +15512,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15503,6 +15734,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15751,6 +15985,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15999,6 +16236,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16247,6 +16487,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16495,6 +16738,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16743,6 +16989,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16991,6 +17240,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17239,6 +17491,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17487,6 +17742,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17735,6 +17993,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17983,6 +18244,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18231,6 +18495,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18479,6 +18746,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18727,6 +18997,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18975,6 +19248,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 46d65187cb1b2..69b0c7f93ab0e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
;
; GFX7-LABEL: global_workgroup_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
;
; GFX7-LABEL: global_workgroup_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -403,6 +409,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
;
; GFX7-LABEL: global_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -590,6 +599,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
;
; GFX7-LABEL: global_workgroup_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -780,6 +792,9 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
;
; GFX7-LABEL: global_workgroup_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -933,6 +948,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
;
; GFX7-LABEL: global_workgroup_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1087,6 +1105,9 @@ define amdgpu_kernel void @global_workgroup_release_store(
;
; GFX7-LABEL: global_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1258,6 +1279,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
;
; GFX7-LABEL: global_workgroup_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1427,6 +1451,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
;
; GFX7-LABEL: global_workgroup_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1578,6 +1605,9 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1740,6 +1770,9 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
;
; GFX7-LABEL: global_workgroup_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1909,6 +1942,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2088,6 +2124,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2268,6 +2307,9 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2454,6 +2496,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2659,6 +2704,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2866,6 +2914,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3082,6 +3133,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3309,6 +3363,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3543,6 +3600,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3787,6 +3847,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4030,6 +4093,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4256,6 +4322,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4483,6 +4552,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4727,6 +4799,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4971,6 +5046,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5215,6 +5293,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5459,6 +5540,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5703,6 +5787,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5947,6 +6034,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6191,6 +6281,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6437,6 +6530,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6685,6 +6781,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6939,6 +7038,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7205,6 +7307,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7478,6 +7583,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7750,6 +7858,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8005,6 +8116,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8259,6 +8373,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8532,6 +8649,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8805,6 +8925,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9078,6 +9201,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9351,6 +9477,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9622,6 +9751,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9895,6 +10027,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10168,6 +10303,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10440,6 +10578,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
;
; GFX7-LABEL: global_workgroup_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10621,6 +10762,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10802,6 +10946,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10988,6 +11135,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11175,6 +11325,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
;
; GFX7-LABEL: global_workgroup_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11328,6 +11481,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11481,6 +11637,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
;
; GFX7-LABEL: global_workgroup_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11644,6 +11803,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11806,6 +11968,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11957,6 +12122,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12118,6 +12286,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12279,6 +12450,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12450,6 +12624,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12623,6 +12800,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12808,6 +12988,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13005,6 +13188,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13205,6 +13391,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13421,6 +13610,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13647,6 +13839,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13873,6 +14068,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14109,6 +14307,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14345,6 +14546,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14571,6 +14775,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14797,6 +15004,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15033,6 +15243,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15269,6 +15482,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15505,6 +15721,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15741,6 +15960,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15977,6 +16199,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16213,6 +16438,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16449,6 +16677,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16688,6 +16919,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16936,6 +17170,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17189,6 +17426,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17447,6 +17687,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17712,6 +17955,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17977,6 +18223,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18232,6 +18481,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18485,6 +18737,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18750,6 +19005,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19015,6 +19273,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19280,6 +19541,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19545,6 +19809,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19808,6 +20075,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20073,6 +20343,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20338,6 +20611,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index 04b0f00fe77b5..78209ee34cad4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -38,6 +38,9 @@ define amdgpu_kernel void @local_nontemporal_load_0(
;
; GFX7-LABEL: local_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
@@ -224,6 +227,9 @@ define amdgpu_kernel void @local_nontemporal_load_1(
;
; GFX7-LABEL: local_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 s7, 2
@@ -830,6 +836,9 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
;
; GFX7-LABEL: local_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 9e5f5fcffca9f..bc2508411ed6b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -34,6 +34,9 @@ define amdgpu_kernel void @local_volatile_load_0(
;
; GFX7-LABEL: local_volatile_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
@@ -172,6 +175,9 @@ define amdgpu_kernel void @local_volatile_load_1(
;
; GFX7-LABEL: local_volatile_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 s7, 2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index fceee413f3f97..2aa4f021c259c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -38,7 +38,10 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX7-LABEL: private_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -53,7 +56,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX10-WGP-LABEL: private_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -67,7 +70,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX10-CU-LABEL: private_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -107,7 +110,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -121,7 +124,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -232,7 +235,10 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX7-LABEL: private_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -249,7 +255,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX10-WGP-LABEL: private_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -265,7 +271,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX10-CU-LABEL: private_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -309,7 +315,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -328,7 +334,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -470,7 +476,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX7-LABEL: private_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
@@ -484,7 +490,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX10-WGP-LABEL: private_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -498,7 +504,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX10-CU-LABEL: private_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -530,7 +536,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -544,7 +550,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -647,7 +653,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX7-LABEL: private_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
@@ -663,7 +669,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX10-WGP-LABEL: private_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -678,7 +684,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX10-CU-LABEL: private_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -713,7 +719,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -731,7 +737,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -874,7 +880,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX7-LABEL: private_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -889,7 +898,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX10-WGP-LABEL: private_nontemporal_volatile_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -903,7 +912,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX10-CU-LABEL: private_nontemporal_volatile_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -943,7 +952,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -957,7 +966,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index f8fb7986938f2..df4193969f8a0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -38,7 +38,10 @@ define amdgpu_kernel void @private_volatile_load_0(
;
; GFX7-LABEL: private_volatile_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -53,7 +56,7 @@ define amdgpu_kernel void @private_volatile_load_0(
;
; GFX10-WGP-LABEL: private_volatile_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -67,7 +70,7 @@ define amdgpu_kernel void @private_volatile_load_0(
;
; GFX10-CU-LABEL: private_volatile_load_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -190,7 +193,10 @@ define amdgpu_kernel void @private_volatile_load_1(
;
; GFX7-LABEL: private_volatile_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -207,7 +213,7 @@ define amdgpu_kernel void @private_volatile_load_1(
;
; GFX10-WGP-LABEL: private_volatile_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -223,7 +229,7 @@ define amdgpu_kernel void @private_volatile_load_1(
;
; GFX10-CU-LABEL: private_volatile_load_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -365,7 +371,7 @@ define amdgpu_kernel void @private_volatile_store_0(
;
; GFX7-LABEL: private_volatile_store_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
@@ -380,7 +386,7 @@ define amdgpu_kernel void @private_volatile_store_0(
;
; GFX10-WGP-LABEL: private_volatile_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -395,7 +401,7 @@ define amdgpu_kernel void @private_volatile_store_0(
;
; GFX10-CU-LABEL: private_volatile_store_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -515,7 +521,7 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX7-LABEL: private_volatile_store_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
@@ -532,7 +538,7 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX10-WGP-LABEL: private_volatile_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -548,7 +554,7 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX10-CU-LABEL: private_volatile_store_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index aaf81e2fa4000..07072f6a36296 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -34,10 +34,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -56,10 +59,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -144,6 +150,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_imin_sle_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -155,6 +164,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_imin_sle_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -214,6 +226,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
; CI-LABEL: s_test_imin_sle_v1i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -225,6 +240,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
; VI-LABEL: s_test_imin_sle_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -288,6 +306,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s3, s3, s7
; CI-NEXT: s_min_i32 s2, s2, s6
@@ -306,6 +327,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s3, s3, s7
; VI-NEXT: s_min_i32 s2, s2, s6
@@ -414,11 +438,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i8 s2, s2
; CI-NEXT: s_sext_i32_i8 s3, s3
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_byte v[0:1], v2
@@ -429,11 +456,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i8 s2, s2
; VI-NEXT: s_sext_i32_i8 s3, s3
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -549,6 +579,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s4, s2, 24
; CI-NEXT: s_sext_i32_i8 s5, s2
@@ -572,6 +604,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; CI-NEXT: s_and_b32 s3, s3, 0xffff
; CI-NEXT: s_or_b32 s2, s3, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -582,6 +615,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s4, s2, 24
; VI-NEXT: s_bfe_i32 s5, s2, 0x80010
@@ -605,6 +640,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_or_b32 s2, s2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -757,6 +793,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
; CI-LABEL: s_test_imin_sle_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s4, s2, 16
; CI-NEXT: s_sext_i32_i16 s2, s2
@@ -776,6 +815,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
; VI-LABEL: s_test_imin_sle_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s4, s2, 16
; VI-NEXT: s_sext_i32_i16 s2, s2
@@ -857,6 +899,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s6, s0, 16
; CI-NEXT: s_ashr_i32 s7, s1, 16
@@ -887,6 +932,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s6, s1, 16
; VI-NEXT: s_sext_i32_i16 s1, s1
@@ -983,10 +1031,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1005,10 +1056,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1122,10 +1176,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1144,10 +1201,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1233,6 +1293,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_imin_slt_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1244,6 +1307,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_imin_slt_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1305,6 +1371,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s1, s1, s3
; CI-NEXT: s_min_i32 s0, s0, s2
@@ -1319,6 +1388,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s1, s1, s3
; VI-NEXT: s_min_i32 s0, s0, s2
@@ -1391,6 +1463,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, 8
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1403,6 +1478,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1468,6 +1546,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, 8
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1480,6 +1561,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1557,10 +1641,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1579,10 +1666,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1686,12 +1776,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v6
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v2, s5
; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v6
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; CI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
@@ -1710,12 +1803,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v6
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; VI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
@@ -1838,12 +1934,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -1874,12 +1973,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -1976,6 +2078,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_umin_ule_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1987,6 +2092,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_umin_ule_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2059,10 +2167,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2081,10 +2192,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2188,6 +2302,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s3
; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
@@ -2209,6 +2326,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0
@@ -2294,6 +2414,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_umin_ult_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2305,6 +2428,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_umin_ult_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2386,6 +2512,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
; CI-LABEL: v_test_umin_ult_i32_multi_use:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-NEXT: s_load_dword s5, s[6:7], 0x0
@@ -2407,6 +2536,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
; VI-LABEL: v_test_umin_ult_i32_multi_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s4, s[4:5], 0x0
; VI-NEXT: s_load_dword s5, s[6:7], 0x0
@@ -2534,6 +2666,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; CI-LABEL: v_test_umin_ult_i16_multi_use:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
@@ -2556,6 +2691,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; VI-LABEL: v_test_umin_ult_i16_multi_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
@@ -2646,6 +2784,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
; CI-LABEL: s_test_umin_ult_v1i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2657,6 +2798,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
; VI-LABEL: s_test_umin_ult_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2726,6 +2870,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
;
; CI-LABEL: s_test_umin_ult_v8i32:
; CI: ; %bb.0:
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -2757,6 +2904,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
;
; VI-LABEL: s_test_umin_ult_v8i32:
; VI: ; %bb.0:
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -2921,6 +3071,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s10, s0, 16
; CI-NEXT: s_and_b32 s0, s0, 0xffff
@@ -2967,6 +3120,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s10, s3, 16
; VI-NEXT: s_and_b32 s3, s3, 0xffff
@@ -3088,11 +3244,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0xffff
; CI-NEXT: s_and_b32 s3, s3, 0xffff
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -3103,11 +3262,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -3195,11 +3357,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i16 s2, s2
; CI-NEXT: s_sext_i32_i16 s3, s3
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -3210,11 +3375,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: s_sext_i32_i16 s3, s3
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -3309,6 +3477,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i16 s3, s2
; CI-NEXT: s_ashr_i32 s2, s2, 16
@@ -3323,6 +3494,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s3, s2
; VI-NEXT: s_ashr_i32 s2, s2, 16
@@ -3403,6 +3577,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3421,6 +3598,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3510,6 +3690,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3528,6 +3711,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3617,6 +3803,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3635,6 +3824,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3724,6 +3916,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3742,6 +3937,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3855,9 +4053,12 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_load_dword v4, v[0:1]
@@ -3886,10 +4087,13 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -4005,9 +4209,12 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_load_dword v4, v[0:1]
@@ -4035,10 +4242,13 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
index 337320b9eeea1..b1ce5a3423f20 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -180,6 +180,9 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -260,6 +263,9 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -341,6 +347,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -403,6 +412,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -465,6 +477,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -527,6 +542,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -588,6 +606,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index bc1710686a087..5803821a1d2c0 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -176,6 +176,9 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1)
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -254,6 +257,9 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -333,6 +339,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 {
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -393,6 +402,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -454,6 +466,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 {
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -514,6 +529,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
index 2e9f09ad41813..7c9ecc892478c 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
; Check that no attributes are added to graphics functions
-; RUN: opt -S -mtriple=amdgcn-amd-amdpal -amdgpu-annotate-kernel-features %s | FileCheck -check-prefixes=AKF_GCN %s
; RUN: opt -S -mtriple=amdgcn-amd-amdpal -passes=amdgpu-attributor %s | FileCheck -check-prefixes=ATTRIBUTOR_GCN %s
; Check that it doesn't crash
@@ -12,12 +11,6 @@ target datalayout = "A5"
define amdgpu_cs void @test_simple_indirect_call() {
-; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call() {
-; AKF_GCN-NEXT: [[PC:%.*]] = call i64 @llvm.amdgcn.s.getpc()
-; AKF_GCN-NEXT: [[FUN:%.*]] = inttoptr i64 [[PC]] to ptr
-; AKF_GCN-NEXT: call amdgpu_gfx void [[FUN]]()
-; AKF_GCN-NEXT: ret void
-;
; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] {
; ATTRIBUTOR_GCN-NEXT: [[PC:%.*]] = call i64 @llvm.amdgcn.s.getpc()
@@ -68,7 +61,6 @@ declare i64 @llvm.amdgcn.s.getpc() #0
attributes #0 = { nounwind readnone speculatable willreturn }
;.
-; AKF_GCN: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" }
; ATTRIBUTOR_GCN: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index 106824a085b42..a75c04e435487 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -26,16 +26,16 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
;
; PEI-GFX908-LABEL: name: partial_copy
; PEI-GFX908: bb.0 (%ir-block.0):
- ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
+ ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
; PEI-GFX908-NEXT: {{ $}}
- ; PEI-GFX908-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
- ; PEI-GFX908-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
- ; PEI-GFX908-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; PEI-GFX908-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+ ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+ ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
@@ -44,7 +44,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+ ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -70,16 +70,16 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
;
; PEI-GFX90A-LABEL: name: partial_copy
; PEI-GFX90A: bb.0 (%ir-block.0):
- ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
+ ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
; PEI-GFX90A-NEXT: {{ $}}
- ; PEI-GFX90A-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
- ; PEI-GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
- ; PEI-GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+ ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+ ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
@@ -87,7 +87,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+ ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index 00507c1eafd6e..c26f0926d86b2 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -19,16 +19,16 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preload_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB0_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB0_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -54,17 +54,16 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: preload_unused_arg_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB1_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB1_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s12
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -90,7 +89,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: no_free_sgprs_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB2_0
; GFX90a-NEXT: .p2align 8
@@ -100,7 +99,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[12:13]
+; GFX90a-NEXT: global_store_dword v0, v1, s[14:15]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -181,7 +180,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: incorrect_type_i64_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB5_0
; GFX90a-NEXT: .p2align 8
@@ -191,7 +190,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i64, ptr addrspace(4) %imp_arg_ptr
@@ -217,7 +216,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: incorrect_type_i16_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB6_0
; GFX90a-NEXT: .p2align 8
@@ -227,7 +226,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i16, ptr addrspace(4) %imp_arg_ptr
@@ -252,16 +251,15 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preload_block_count_y:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB7_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB7_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
@@ -289,7 +287,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: random_incorrect_offset:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB8_0
; GFX90a-NEXT: .p2align 8
@@ -300,7 +298,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
@@ -327,17 +325,16 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preload_block_count_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB9_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB9_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s12
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
@@ -366,19 +363,18 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa
;
; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB10_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB10_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
-; GFX90a-NEXT: s_add_i32 s0, s10, s0
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
+; GFX90a-NEXT: s_add_i32 s0, s12, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -408,19 +404,18 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_block_count_xyz:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB11_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB11_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
@@ -454,17 +449,17 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_workgroup_size_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB12_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB12_0:
-; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
@@ -492,17 +487,17 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_workgroup_size_y:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB13_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB13_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
+; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
@@ -531,18 +526,18 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_workgroup_size_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB14_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB14_0:
-; GFX90a-NEXT: s_and_b32 s0, s12, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s14, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
@@ -575,22 +570,22 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
;
; GFX90a-LABEL: preload_workgroup_size_xyz:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB15_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB15_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
-; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff
-; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff
+; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-NEXT: s_and_b32 s1, s13, 0xffff
+; GFX90a-NEXT: s_and_b32 s2, s14, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
@@ -628,18 +623,18 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-LABEL: preload_remainder_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB16_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB16_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s12, 16
+; GFX90a-NEXT: s_lshr_b32 s0, s14, 16
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
@@ -668,18 +663,16 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-LABEL: preloadremainder_y:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB17_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB17_0:
-; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s15, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
@@ -708,18 +701,16 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-LABEL: preloadremainder_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB18_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB18_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-NEXT: s_lshr_b32 s0, s15, 16
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
@@ -752,22 +743,20 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preloadremainder_xyz:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB19_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB19_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT: s_lshr_b32 s1, s12, 16
-; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff
+; GFX90a-NEXT: s_lshr_b32 s0, s15, 16
+; GFX90a-NEXT: s_lshr_b32 s1, s14, 16
+; GFX90a-NEXT: s_and_b32 s2, s15, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
@@ -803,7 +792,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
;
; GFX90a-LABEL: no_free_sgprs_preloadremainder_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB20_0
; GFX90a-NEXT: .p2align 8
@@ -814,7 +803,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_lshr_b32 s0, s0, 16
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[12:13]
+; GFX90a-NEXT: global_store_dword v0, v1, s[14:15]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
@@ -844,10 +833,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
;
; GFX90a-LABEL: preload_block_max_user_sgprs:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB21_0
; GFX90a-NEXT: .p2align 8
@@ -857,7 +843,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -887,21 +873,23 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt
;
; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB22_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB22_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff
+; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x1c
+; GFX90a-NEXT: s_and_b32 s1, s14, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: s_lshr_b32 s0, s0, 16
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index fe6378435a42e..7ae0c11dca279 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -21,17 +21,17 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0)
;
; GFX90a-LABEL: ptr1_i8:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB0_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB0_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
@@ -56,17 +56,17 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero
;
; GFX90a-LABEL: ptr1_i8_zext_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB1_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB1_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -91,17 +91,17 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16
;
; GFX90a-LABEL: ptr1_i16_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB2_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB2_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -125,16 +125,16 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32
;
; GFX90a-LABEL: ptr1_i32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB3_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB3_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store i32 %arg0, ptr addrspace(1) %out
ret void
@@ -160,18 +160,17 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspa
;
; GFX90a-LABEL: i32_ptr1_i32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB4_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB4_0:
-; GFX90a-NEXT: s_add_i32 s0, s6, s10
+; GFX90a-NEXT: s_add_i32 s0, s8, s12
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
@@ -198,19 +197,19 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: ptr1_i16_i16_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB5_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB5_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s8, 16
-; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-NEXT: s_lshr_b32 s0, s10, 16
+; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff
; GFX90a-NEXT: s_add_i32 s0, s1, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
@@ -236,16 +235,16 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2
;
; GFX90a-LABEL: ptr1_v2i8_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB6_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB6_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <2 x i8> %in, ptr addrspace(1) %out
ret void
@@ -274,7 +273,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
;
; GFX90a-LABEL: byref_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB7_0
; GFX90a-NEXT: .p2align 8
@@ -285,9 +284,9 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
@@ -320,7 +319,7 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: byref_staggered_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB8_0
; GFX90a-NEXT: .p2align 8
@@ -331,9 +330,9 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
@@ -370,26 +369,26 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x
;
; GFX90a-LABEL: v8i32_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB9_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB9_0:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v0, s16
+; GFX90a-NEXT: v_mov_b32_e32 v1, s17
+; GFX90a-NEXT: v_mov_b32_e32 v2, s18
+; GFX90a-NEXT: v_mov_b32_e32 v3, s19
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
+; GFX90a-NEXT: s_nop 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-NEXT: s_nop 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
store <8 x i32> %in, ptr addrspace(1) %out, align 4
ret void
@@ -414,18 +413,17 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-LABEL: v3i16_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB10_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB10_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <3 x i16> %in, ptr addrspace(1) %out, align 4
ret void
@@ -451,19 +449,17 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-LABEL: v3i32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB11_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB11_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
store <3 x i32> %in, ptr addrspace(1) %out, align 4
ret void
@@ -489,19 +485,17 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-LABEL: v3f32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB12_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB12_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
store <3 x float> %in, ptr addrspace(1) %out, align 4
ret void
@@ -533,25 +527,24 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou
;
; GFX90a-LABEL: v5i8_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB13_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB13_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s8, 24
+; GFX90a-NEXT: s_lshr_b32 s1, s10, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s0, s0, s1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_byte v0, v1, s[6:7] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_byte v0, v1, s[8:9] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <5 x i8> %in, ptr addrspace(1) %out, align 4
ret void
@@ -587,29 +580,29 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x
;
; GFX90a-LABEL: v5f64_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB14_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB14_0:
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x40
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s16
+; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] offset:32
+; GFX90a-NEXT: v_mov_b32_e32 v1, s17
+; GFX90a-NEXT: v_mov_b32_e32 v2, s18
+; GFX90a-NEXT: v_mov_b32_e32 v3, s19
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
+; GFX90a-NEXT: s_nop 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-NEXT: s_nop 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
@@ -647,31 +640,30 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8
;
; GFX90a-LABEL: v8i8_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB15_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB15_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s9, 24
+; GFX90a-NEXT: s_lshr_b32 s1, s11, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s9, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s2, s11, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_lshr_b32 s2, s8, 24
+; GFX90a-NEXT: s_lshr_b32 s2, s10, 24
; GFX90a-NEXT: s_lshl_b32 s2, s2, 8
-; GFX90a-NEXT: s_bfe_u32 s3, s8, 0x80010
-; GFX90a-NEXT: s_and_b32 s0, s9, 0xffff
+; GFX90a-NEXT: s_bfe_u32 s3, s10, 0x80010
+; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s2, s3, s2
; GFX90a-NEXT: s_or_b32 s0, s0, s1
-; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff
; GFX90a-NEXT: s_lshl_b32 s2, s2, 16
; GFX90a-NEXT: s_or_b32 s1, s1, s2
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
store <8 x i8> %in, ptr addrspace(1) %out
ret void
@@ -694,16 +686,15 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i
;
; GFX90a-LABEL: i64_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB16_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB16_0:
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
store i64 %a, ptr addrspace(1) %out, align 8
ret void
@@ -726,16 +717,15 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, d
;
; GFX90a-LABEL: f64_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB17_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB17_0:
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
store double %in, ptr addrspace(1) %out
ret void
@@ -758,16 +748,16 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: half_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB18_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB18_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
ret void
@@ -790,16 +780,16 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out
;
; GFX90a-LABEL: bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB19_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB19_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store bfloat %in, ptr addrspace(1) %out
ret void
@@ -822,16 +812,16 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: v2bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB20_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB20_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <2 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -856,18 +846,17 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: v3bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB21_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB21_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <3 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -893,19 +882,17 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: v6bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB22_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB22_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
store <6 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -934,24 +921,24 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr
;
; GFX90a-LABEL: half_v7bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB23_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB23_0:
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NEXT: global_store_short v3, v0, s[6:7]
-; GFX90a-NEXT: v_mov_b32_e32 v0, s13
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: global_store_short v3, v0, s[0:1] offset:12
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NEXT: global_store_short v3, v0, s[8:9]
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v0, s3
+; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
store <7 x bfloat> %in2, ptr addrspace(1) %out2
@@ -976,17 +963,17 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1
;
; GFX90a-LABEL: i1_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB24_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB24_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 1
+; GFX90a-NEXT: s_and_b32 s0, s10, 1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_byte v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_byte v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store i1 %in, ptr addrspace(1) %out
ret void
@@ -1013,20 +1000,18 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: fp128_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB25_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB25_0:
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: v_mov_b32_e32 v3, s13
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
store fp128 %in, ptr addrspace(1) %out
ret void
@@ -1059,26 +1044,25 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: v7i8_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB26_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB26_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s8, 24
+; GFX90a-NEXT: s_lshr_b32 s1, s10, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s0, s0, s1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[6:7] offset:6
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[8:9] offset:6
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <7 x i8> %in, ptr addrspace(1) %out
ret void
@@ -1106,21 +1090,19 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out
;
; GFX90a-LABEL: v7half_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB27_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB27_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s13
-; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s15
+; GFX90a-NEXT: global_store_short v3, v0, s[8:9] offset:12
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
store <7 x half> %in, ptr addrspace(1) %out
ret void
@@ -1145,18 +1127,18 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %ou
;
; GFX90a-LABEL: i16_i32_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB28_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB28_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_dword v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i32 %in2, ptr addrspace(1) %out2
@@ -1184,22 +1166,22 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %
;
; GFX90a-LABEL: i16_v3i32_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB29_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB29_0:
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v4, s8
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: global_store_short v3, v4, s[6:7]
+; GFX90a-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
+; GFX90a-NEXT: v_mov_b32_e32 v4, s10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NEXT: global_store_short v3, v4, s[8:9]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <3 x i32> %in2, ptr addrspace(1) %out2
@@ -1224,17 +1206,17 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %ou
;
; GFX90a-LABEL: i16_i16_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB30_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB30_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[10:11]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i16 %in2, ptr addrspace(1) %out2
@@ -1264,22 +1246,22 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: i16_v2i8_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB31_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB31_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-NEXT: s_lshr_b32 s0, s10, 24
; GFX90a-NEXT: s_lshl_b32 s0, s0, 8
-; GFX90a-NEXT: s_bfe_u32 s1, s8, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s1, s10, 0x80010
; GFX90a-NEXT: s_or_b32 s0, s1, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_short v0, v1, s[10:11]
+; GFX90a-NEXT: global_store_short v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <2 x i8> %in2, ptr addrspace(1) %out2
@@ -1308,7 +1290,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
;
; GFX90a-LABEL: i32_ptr1_i32_staggered_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB32_0
; GFX90a-NEXT: .p2align 8
@@ -1318,7 +1300,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_add_i32 s2, s6, s2
+; GFX90a-NEXT: s_add_i32 s2, s8, s2
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
@@ -1345,17 +1327,16 @@ define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: ptr1_i8_trailing_unused:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB33_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB33_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 5474338514522..8f25e6519588b 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -6,6 +6,9 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b,
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
@@ -33,9 +36,12 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a
; GCN-NEXT: s_load_dword s2, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0x5a
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_sad_u32 v2, s2, v0, 20
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -57,6 +63,9 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b,
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
@@ -79,12 +88,14 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b,
define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_sub_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_min_u32 s3, s0, s1
; GCN-NEXT: s_max_u32 s0, s0, s1
@@ -92,8 +103,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
@@ -115,19 +127,22 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_add_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_mov_b32_e32 v3, s2
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_sad_u32 v2, s0, v2, v3
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -147,21 +162,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i
define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_max_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_max_u32 s3, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_store_dword v[0:1], v3
; GCN-NEXT: s_endpgm
@@ -182,21 +200,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_min_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_min_u32 s3, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_store_dword v[0:1], v3
; GCN-NEXT: s_endpgm
@@ -218,21 +239,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_sub_pat2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s3, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_store_dword v[0:1], v3
; GCN-NEXT: s_endpgm
@@ -251,12 +275,14 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_select_pat2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_min_u32 s3, s0, s1
; GCN-NEXT: s_max_u32 s0, s0, s1
@@ -264,8 +290,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
@@ -285,6 +312,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; GCN-LABEL: v_sad_u32_vector_pat1:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc
; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
@@ -321,6 +351,9 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32
define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; GCN-LABEL: v_sad_u32_vector_pat2:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc
; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
@@ -358,6 +391,8 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
; GCN-NEXT: s_load_dword s4, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-NEXT: s_lshr_b32 s0, s0, 16
@@ -365,6 +400,7 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_sad_u32 v2, s4, v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -384,6 +420,9 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
; GCN-LABEL: v_sad_u32_i16_pat2:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -416,6 +455,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s3, s2, 0xff
; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008
@@ -443,6 +485,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
; GCN-LABEL: v_sad_u32_i8_pat2:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -475,6 +520,9 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s3, s2, 0xff
; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008
@@ -502,6 +550,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_max_u32 s6, s0, s1
; GCN-NEXT: s_cmp_le_u32 s0, s1
@@ -531,6 +582,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s3, s0, s3
; GCN-NEXT: s_sub_i32 s6, s1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index 884ba3fc34dff..29448ab2d822e 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -9,6 +9,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v5, s3
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -24,6 +26,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v5, s3
; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -39,6 +43,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, s3
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -55,6 +61,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -88,6 +96,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v5, s3
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -103,6 +113,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v5, s3
; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -118,6 +130,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, s3
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -134,6 +148,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index 0ad10437299f4..90dfd5a21d107 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -20,179 +20,183 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: ; def s[2:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[4:7]
-; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
; CHECK-NEXT: v_writelane_b32 v22, s2, 0
; CHECK-NEXT: v_writelane_b32 v22, s3, 1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[48:51]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[4:11]
+; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_writelane_b32 v22, s4, 2
; CHECK-NEXT: v_writelane_b32 v22, s5, 3
; CHECK-NEXT: v_writelane_b32 v22, s6, 4
-; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
; CHECK-NEXT: v_writelane_b32 v22, s7, 5
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[4:11]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s4, 6
-; CHECK-NEXT: v_writelane_b32 v22, s5, 7
-; CHECK-NEXT: v_writelane_b32 v22, s6, 8
-; CHECK-NEXT: v_writelane_b32 v22, s7, 9
-; CHECK-NEXT: v_writelane_b32 v22, s8, 10
-; CHECK-NEXT: v_writelane_b32 v22, s9, 11
-; CHECK-NEXT: v_writelane_b32 v22, s10, 12
-; CHECK-NEXT: v_writelane_b32 v22, s11, 13
+; CHECK-NEXT: v_writelane_b32 v22, s8, 6
+; CHECK-NEXT: v_writelane_b32 v22, s9, 7
+; CHECK-NEXT: v_writelane_b32 v22, s10, 8
+; CHECK-NEXT: v_writelane_b32 v22, s11, 9
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[4:19]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s4, 14
-; CHECK-NEXT: v_writelane_b32 v22, s5, 15
-; CHECK-NEXT: v_writelane_b32 v22, s6, 16
-; CHECK-NEXT: v_writelane_b32 v22, s7, 17
-; CHECK-NEXT: v_writelane_b32 v22, s8, 18
-; CHECK-NEXT: v_writelane_b32 v22, s9, 19
-; CHECK-NEXT: v_writelane_b32 v22, s10, 20
-; CHECK-NEXT: v_writelane_b32 v22, s11, 21
-; CHECK-NEXT: v_writelane_b32 v22, s12, 22
-; CHECK-NEXT: v_writelane_b32 v22, s13, 23
-; CHECK-NEXT: v_writelane_b32 v22, s14, 24
-; CHECK-NEXT: v_writelane_b32 v22, s15, 25
-; CHECK-NEXT: v_writelane_b32 v22, s16, 26
-; CHECK-NEXT: v_writelane_b32 v22, s17, 27
-; CHECK-NEXT: v_writelane_b32 v22, s18, 28
-; CHECK-NEXT: v_writelane_b32 v22, s19, 29
+; CHECK-NEXT: v_writelane_b32 v22, s4, 10
+; CHECK-NEXT: v_writelane_b32 v22, s5, 11
+; CHECK-NEXT: v_writelane_b32 v22, s6, 12
+; CHECK-NEXT: v_writelane_b32 v22, s7, 13
+; CHECK-NEXT: v_writelane_b32 v22, s8, 14
+; CHECK-NEXT: v_writelane_b32 v22, s9, 15
+; CHECK-NEXT: v_writelane_b32 v22, s10, 16
+; CHECK-NEXT: v_writelane_b32 v22, s11, 17
+; CHECK-NEXT: v_writelane_b32 v22, s12, 18
+; CHECK-NEXT: v_writelane_b32 v22, s13, 19
+; CHECK-NEXT: v_writelane_b32 v22, s14, 20
+; CHECK-NEXT: v_writelane_b32 v22, s15, 21
+; CHECK-NEXT: v_writelane_b32 v22, s16, 22
+; CHECK-NEXT: v_writelane_b32 v22, s17, 23
+; CHECK-NEXT: v_writelane_b32 v22, s18, 24
+; CHECK-NEXT: v_writelane_b32 v22, s19, 25
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[42:43]
+; CHECK-NEXT: ; def s[38:39]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[52:55]
+; CHECK-NEXT: ; def s[44:47]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[4:11]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s4, 30
-; CHECK-NEXT: v_writelane_b32 v22, s5, 31
-; CHECK-NEXT: v_writelane_b32 v22, s6, 32
-; CHECK-NEXT: v_writelane_b32 v22, s7, 33
-; CHECK-NEXT: v_writelane_b32 v22, s8, 34
-; CHECK-NEXT: v_writelane_b32 v22, s9, 35
-; CHECK-NEXT: v_writelane_b32 v22, s10, 36
-; CHECK-NEXT: v_writelane_b32 v22, s11, 37
+; CHECK-NEXT: v_writelane_b32 v22, s4, 26
+; CHECK-NEXT: v_writelane_b32 v22, s5, 27
+; CHECK-NEXT: v_writelane_b32 v22, s6, 28
+; CHECK-NEXT: v_writelane_b32 v22, s7, 29
+; CHECK-NEXT: v_writelane_b32 v22, s8, 30
+; CHECK-NEXT: v_writelane_b32 v22, s9, 31
+; CHECK-NEXT: v_writelane_b32 v22, s10, 32
+; CHECK-NEXT: v_writelane_b32 v22, s11, 33
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[16:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[40:41]
+; CHECK-NEXT: ; def s[36:37]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[36:39]
+; CHECK-NEXT: ; def s[40:43]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[44:51]
+; CHECK-NEXT: ; def s[0:7]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_writelane_b32 v22, s0, 34
+; CHECK-NEXT: v_writelane_b32 v22, s1, 35
+; CHECK-NEXT: v_writelane_b32 v22, s2, 36
+; CHECK-NEXT: v_writelane_b32 v22, s3, 37
+; CHECK-NEXT: v_writelane_b32 v22, s4, 38
+; CHECK-NEXT: v_writelane_b32 v22, s5, 39
+; CHECK-NEXT: v_writelane_b32 v22, s6, 40
+; CHECK-NEXT: v_writelane_b32 v22, s7, 41
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s0, 38
-; CHECK-NEXT: v_writelane_b32 v22, s1, 39
-; CHECK-NEXT: v_writelane_b32 v22, s2, 40
-; CHECK-NEXT: v_writelane_b32 v22, s3, 41
-; CHECK-NEXT: v_writelane_b32 v22, s4, 42
-; CHECK-NEXT: v_writelane_b32 v22, s5, 43
-; CHECK-NEXT: v_writelane_b32 v22, s6, 44
-; CHECK-NEXT: v_writelane_b32 v22, s7, 45
-; CHECK-NEXT: v_writelane_b32 v22, s8, 46
-; CHECK-NEXT: v_writelane_b32 v22, s9, 47
-; CHECK-NEXT: v_writelane_b32 v22, s10, 48
-; CHECK-NEXT: v_writelane_b32 v22, s11, 49
-; CHECK-NEXT: v_writelane_b32 v22, s12, 50
-; CHECK-NEXT: v_writelane_b32 v22, s13, 51
-; CHECK-NEXT: v_writelane_b32 v22, s14, 52
-; CHECK-NEXT: v_writelane_b32 v22, s15, 53
+; CHECK-NEXT: v_writelane_b32 v22, s0, 42
+; CHECK-NEXT: v_writelane_b32 v22, s1, 43
+; CHECK-NEXT: v_writelane_b32 v22, s2, 44
+; CHECK-NEXT: v_writelane_b32 v22, s3, 45
+; CHECK-NEXT: v_writelane_b32 v22, s4, 46
+; CHECK-NEXT: v_writelane_b32 v22, s5, 47
+; CHECK-NEXT: v_writelane_b32 v22, s6, 48
+; CHECK-NEXT: v_writelane_b32 v22, s7, 49
+; CHECK-NEXT: v_writelane_b32 v22, s8, 50
+; CHECK-NEXT: v_writelane_b32 v22, s9, 51
+; CHECK-NEXT: v_writelane_b32 v22, s10, 52
+; CHECK-NEXT: v_writelane_b32 v22, s11, 53
+; CHECK-NEXT: v_writelane_b32 v22, s12, 54
+; CHECK-NEXT: v_writelane_b32 v22, s13, 55
+; CHECK-NEXT: v_writelane_b32 v22, s14, 56
+; CHECK-NEXT: v_writelane_b32 v22, s15, 57
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[34:35]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s0, 54
-; CHECK-NEXT: v_writelane_b32 v22, s1, 55
-; CHECK-NEXT: v_writelane_b32 v22, s2, 56
-; CHECK-NEXT: v_writelane_b32 v22, s3, 57
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[0:7]
-; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_writelane_b32 v22, s0, 58
; CHECK-NEXT: v_writelane_b32 v22, s1, 59
; CHECK-NEXT: v_writelane_b32 v22, s2, 60
-; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v22, s3, 61
-; CHECK-NEXT: v_writelane_b32 v22, s4, 62
-; CHECK-NEXT: v_writelane_b32 v23, s6, 0
-; CHECK-NEXT: v_writelane_b32 v22, s5, 63
-; CHECK-NEXT: v_writelane_b32 v23, s7, 1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[0:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
+; CHECK-NEXT: v_writelane_b32 v22, s0, 62
+; CHECK-NEXT: v_writelane_b32 v23, s2, 0
+; CHECK-NEXT: v_writelane_b32 v23, s3, 1
+; CHECK-NEXT: v_writelane_b32 v23, s4, 2
+; CHECK-NEXT: v_writelane_b32 v23, s5, 3
+; CHECK-NEXT: v_writelane_b32 v23, s6, 4
+; CHECK-NEXT: v_writelane_b32 v22, s1, 63
+; CHECK-NEXT: v_writelane_b32 v23, s7, 5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 2
-; CHECK-NEXT: v_writelane_b32 v23, s1, 3
-; CHECK-NEXT: v_writelane_b32 v23, s2, 4
-; CHECK-NEXT: v_writelane_b32 v23, s3, 5
-; CHECK-NEXT: v_writelane_b32 v23, s4, 6
-; CHECK-NEXT: v_writelane_b32 v23, s5, 7
-; CHECK-NEXT: v_writelane_b32 v23, s6, 8
-; CHECK-NEXT: v_writelane_b32 v23, s7, 9
-; CHECK-NEXT: v_writelane_b32 v23, s8, 10
-; CHECK-NEXT: v_writelane_b32 v23, s9, 11
-; CHECK-NEXT: v_writelane_b32 v23, s10, 12
-; CHECK-NEXT: v_writelane_b32 v23, s11, 13
-; CHECK-NEXT: v_writelane_b32 v23, s12, 14
-; CHECK-NEXT: v_writelane_b32 v23, s13, 15
-; CHECK-NEXT: v_writelane_b32 v23, s14, 16
-; CHECK-NEXT: v_writelane_b32 v23, s15, 17
+; CHECK-NEXT: v_writelane_b32 v23, s0, 6
+; CHECK-NEXT: v_writelane_b32 v23, s1, 7
+; CHECK-NEXT: v_writelane_b32 v23, s2, 8
+; CHECK-NEXT: v_writelane_b32 v23, s3, 9
+; CHECK-NEXT: v_writelane_b32 v23, s4, 10
+; CHECK-NEXT: v_writelane_b32 v23, s5, 11
+; CHECK-NEXT: v_writelane_b32 v23, s6, 12
+; CHECK-NEXT: v_writelane_b32 v23, s7, 13
+; CHECK-NEXT: v_writelane_b32 v23, s8, 14
+; CHECK-NEXT: v_writelane_b32 v23, s9, 15
+; CHECK-NEXT: v_writelane_b32 v23, s10, 16
+; CHECK-NEXT: v_writelane_b32 v23, s11, 17
+; CHECK-NEXT: v_writelane_b32 v23, s12, 18
+; CHECK-NEXT: v_writelane_b32 v23, s13, 19
+; CHECK-NEXT: v_writelane_b32 v23, s14, 20
+; CHECK-NEXT: v_writelane_b32 v23, s15, 21
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 18
-; CHECK-NEXT: v_writelane_b32 v23, s1, 19
+; CHECK-NEXT: v_writelane_b32 v23, s0, 22
+; CHECK-NEXT: v_writelane_b32 v23, s1, 23
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 20
-; CHECK-NEXT: v_writelane_b32 v23, s1, 21
-; CHECK-NEXT: v_writelane_b32 v23, s2, 22
-; CHECK-NEXT: v_writelane_b32 v23, s3, 23
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[0:7]
-; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_writelane_b32 v23, s0, 24
; CHECK-NEXT: v_writelane_b32 v23, s1, 25
; CHECK-NEXT: v_writelane_b32 v23, s2, 26
; CHECK-NEXT: v_writelane_b32 v23, s3, 27
-; CHECK-NEXT: v_writelane_b32 v23, s4, 28
-; CHECK-NEXT: v_writelane_b32 v23, s5, 29
-; CHECK-NEXT: v_writelane_b32 v23, s6, 30
-; CHECK-NEXT: v_writelane_b32 v23, s7, 31
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[0:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_writelane_b32 v23, s0, 28
+; CHECK-NEXT: v_writelane_b32 v23, s1, 29
+; CHECK-NEXT: v_writelane_b32 v23, s2, 30
+; CHECK-NEXT: v_writelane_b32 v23, s3, 31
+; CHECK-NEXT: v_writelane_b32 v23, s4, 32
+; CHECK-NEXT: v_writelane_b32 v23, s5, 33
+; CHECK-NEXT: v_writelane_b32 v23, s6, 34
+; CHECK-NEXT: v_writelane_b32 v23, s7, 35
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 32
-; CHECK-NEXT: v_writelane_b32 v23, s1, 33
-; CHECK-NEXT: v_writelane_b32 v23, s2, 34
-; CHECK-NEXT: v_writelane_b32 v23, s3, 35
-; CHECK-NEXT: v_writelane_b32 v23, s4, 36
-; CHECK-NEXT: v_writelane_b32 v23, s5, 37
-; CHECK-NEXT: v_writelane_b32 v23, s6, 38
-; CHECK-NEXT: v_writelane_b32 v23, s7, 39
-; CHECK-NEXT: v_writelane_b32 v23, s8, 40
-; CHECK-NEXT: v_writelane_b32 v23, s9, 41
-; CHECK-NEXT: v_writelane_b32 v23, s10, 42
-; CHECK-NEXT: v_writelane_b32 v23, s11, 43
-; CHECK-NEXT: v_writelane_b32 v23, s12, 44
-; CHECK-NEXT: v_writelane_b32 v23, s13, 45
-; CHECK-NEXT: v_writelane_b32 v23, s14, 46
-; CHECK-NEXT: v_writelane_b32 v23, s15, 47
+; CHECK-NEXT: v_writelane_b32 v23, s0, 36
+; CHECK-NEXT: v_writelane_b32 v23, s1, 37
+; CHECK-NEXT: v_writelane_b32 v23, s2, 38
+; CHECK-NEXT: v_writelane_b32 v23, s3, 39
+; CHECK-NEXT: v_writelane_b32 v23, s4, 40
+; CHECK-NEXT: v_writelane_b32 v23, s5, 41
+; CHECK-NEXT: v_writelane_b32 v23, s6, 42
+; CHECK-NEXT: v_writelane_b32 v23, s7, 43
+; CHECK-NEXT: v_writelane_b32 v23, s8, 44
+; CHECK-NEXT: v_writelane_b32 v23, s9, 45
+; CHECK-NEXT: v_writelane_b32 v23, s10, 46
+; CHECK-NEXT: v_writelane_b32 v23, s11, 47
+; CHECK-NEXT: v_writelane_b32 v23, s12, 48
+; CHECK-NEXT: v_writelane_b32 v23, s13, 49
+; CHECK-NEXT: v_writelane_b32 v23, s14, 50
+; CHECK-NEXT: v_writelane_b32 v23, s15, 51
; CHECK-NEXT: s_cbranch_scc0 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %ret
; CHECK-NEXT: s_endpgm
@@ -206,166 +210,170 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: v_readlane_b32 s1, v22, 3
; CHECK-NEXT: v_readlane_b32 s2, v22, 4
; CHECK-NEXT: v_readlane_b32 s3, v22, 5
+; CHECK-NEXT: v_readlane_b32 s4, v22, 6
+; CHECK-NEXT: v_readlane_b32 s5, v22, 7
+; CHECK-NEXT: v_readlane_b32 s6, v22, 8
+; CHECK-NEXT: v_readlane_b32 s7, v22, 9
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[0:3]
+; CHECK-NEXT: ; use s[48:51]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 6
-; CHECK-NEXT: v_readlane_b32 s1, v22, 7
-; CHECK-NEXT: v_readlane_b32 s2, v22, 8
-; CHECK-NEXT: v_readlane_b32 s3, v22, 9
-; CHECK-NEXT: v_readlane_b32 s4, v22, 10
-; CHECK-NEXT: v_readlane_b32 s5, v22, 11
-; CHECK-NEXT: v_readlane_b32 s6, v22, 12
-; CHECK-NEXT: v_readlane_b32 s7, v22, 13
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 14
-; CHECK-NEXT: v_readlane_b32 s1, v22, 15
-; CHECK-NEXT: v_readlane_b32 s2, v22, 16
-; CHECK-NEXT: v_readlane_b32 s3, v22, 17
-; CHECK-NEXT: v_readlane_b32 s4, v22, 18
-; CHECK-NEXT: v_readlane_b32 s5, v22, 19
-; CHECK-NEXT: v_readlane_b32 s6, v22, 20
-; CHECK-NEXT: v_readlane_b32 s7, v22, 21
-; CHECK-NEXT: v_readlane_b32 s8, v22, 22
-; CHECK-NEXT: v_readlane_b32 s9, v22, 23
-; CHECK-NEXT: v_readlane_b32 s10, v22, 24
-; CHECK-NEXT: v_readlane_b32 s11, v22, 25
-; CHECK-NEXT: v_readlane_b32 s12, v22, 26
-; CHECK-NEXT: v_readlane_b32 s13, v22, 27
-; CHECK-NEXT: v_readlane_b32 s14, v22, 28
-; CHECK-NEXT: v_readlane_b32 s15, v22, 29
+; CHECK-NEXT: v_readlane_b32 s0, v22, 10
+; CHECK-NEXT: v_readlane_b32 s1, v22, 11
+; CHECK-NEXT: v_readlane_b32 s2, v22, 12
+; CHECK-NEXT: v_readlane_b32 s3, v22, 13
+; CHECK-NEXT: v_readlane_b32 s4, v22, 14
+; CHECK-NEXT: v_readlane_b32 s5, v22, 15
+; CHECK-NEXT: v_readlane_b32 s6, v22, 16
+; CHECK-NEXT: v_readlane_b32 s7, v22, 17
+; CHECK-NEXT: v_readlane_b32 s8, v22, 18
+; CHECK-NEXT: v_readlane_b32 s9, v22, 19
+; CHECK-NEXT: v_readlane_b32 s10, v22, 20
+; CHECK-NEXT: v_readlane_b32 s11, v22, 21
+; CHECK-NEXT: v_readlane_b32 s12, v22, 22
+; CHECK-NEXT: v_readlane_b32 s13, v22, 23
+; CHECK-NEXT: v_readlane_b32 s14, v22, 24
+; CHECK-NEXT: v_readlane_b32 s15, v22, 25
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 30
-; CHECK-NEXT: v_readlane_b32 s1, v22, 31
-; CHECK-NEXT: v_readlane_b32 s2, v22, 32
-; CHECK-NEXT: v_readlane_b32 s3, v22, 33
-; CHECK-NEXT: v_readlane_b32 s4, v22, 34
-; CHECK-NEXT: v_readlane_b32 s5, v22, 35
-; CHECK-NEXT: v_readlane_b32 s6, v22, 36
-; CHECK-NEXT: v_readlane_b32 s7, v22, 37
+; CHECK-NEXT: v_readlane_b32 s0, v22, 26
+; CHECK-NEXT: v_readlane_b32 s1, v22, 27
+; CHECK-NEXT: v_readlane_b32 s2, v22, 28
+; CHECK-NEXT: v_readlane_b32 s3, v22, 29
+; CHECK-NEXT: v_readlane_b32 s4, v22, 30
+; CHECK-NEXT: v_readlane_b32 s5, v22, 31
+; CHECK-NEXT: v_readlane_b32 s6, v22, 32
+; CHECK-NEXT: v_readlane_b32 s7, v22, 33
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[42:43]
+; CHECK-NEXT: ; use s[38:39]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[52:55]
+; CHECK-NEXT: ; use s[44:47]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 38
-; CHECK-NEXT: v_readlane_b32 s1, v22, 39
-; CHECK-NEXT: v_readlane_b32 s2, v22, 40
-; CHECK-NEXT: v_readlane_b32 s3, v22, 41
+; CHECK-NEXT: v_readlane_b32 s0, v22, 34
+; CHECK-NEXT: v_readlane_b32 s1, v22, 35
+; CHECK-NEXT: v_readlane_b32 s2, v22, 36
+; CHECK-NEXT: v_readlane_b32 s3, v22, 37
+; CHECK-NEXT: v_readlane_b32 s4, v22, 38
+; CHECK-NEXT: v_readlane_b32 s5, v22, 39
+; CHECK-NEXT: v_readlane_b32 s6, v22, 40
+; CHECK-NEXT: v_readlane_b32 s7, v22, 41
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[16:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[40:41]
+; CHECK-NEXT: ; use s[36:37]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[36:39]
+; CHECK-NEXT: ; use s[40:43]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[44:51]
+; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s4, v22, 42
-; CHECK-NEXT: v_readlane_b32 s5, v22, 43
-; CHECK-NEXT: v_readlane_b32 s6, v22, 44
-; CHECK-NEXT: v_readlane_b32 s7, v22, 45
-; CHECK-NEXT: v_readlane_b32 s8, v22, 46
-; CHECK-NEXT: v_readlane_b32 s9, v22, 47
-; CHECK-NEXT: v_readlane_b32 s10, v22, 48
-; CHECK-NEXT: v_readlane_b32 s11, v22, 49
-; CHECK-NEXT: v_readlane_b32 s12, v22, 50
-; CHECK-NEXT: v_readlane_b32 s13, v22, 51
-; CHECK-NEXT: v_readlane_b32 s14, v22, 52
-; CHECK-NEXT: v_readlane_b32 s15, v22, 53
+; CHECK-NEXT: v_readlane_b32 s0, v22, 42
+; CHECK-NEXT: v_readlane_b32 s1, v22, 43
+; CHECK-NEXT: v_readlane_b32 s2, v22, 44
+; CHECK-NEXT: v_readlane_b32 s3, v22, 45
+; CHECK-NEXT: v_readlane_b32 s4, v22, 46
+; CHECK-NEXT: v_readlane_b32 s5, v22, 47
+; CHECK-NEXT: v_readlane_b32 s6, v22, 48
+; CHECK-NEXT: v_readlane_b32 s7, v22, 49
+; CHECK-NEXT: v_readlane_b32 s8, v22, 50
+; CHECK-NEXT: v_readlane_b32 s9, v22, 51
+; CHECK-NEXT: v_readlane_b32 s10, v22, 52
+; CHECK-NEXT: v_readlane_b32 s11, v22, 53
+; CHECK-NEXT: v_readlane_b32 s12, v22, 54
+; CHECK-NEXT: v_readlane_b32 s13, v22, 55
+; CHECK-NEXT: v_readlane_b32 s14, v22, 56
+; CHECK-NEXT: v_readlane_b32 s15, v22, 57
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 54
-; CHECK-NEXT: v_readlane_b32 s1, v22, 55
-; CHECK-NEXT: v_readlane_b32 s2, v22, 56
-; CHECK-NEXT: v_readlane_b32 s3, v22, 57
+; CHECK-NEXT: v_readlane_b32 s0, v22, 58
+; CHECK-NEXT: v_readlane_b32 s1, v22, 59
+; CHECK-NEXT: v_readlane_b32 s2, v22, 60
+; CHECK-NEXT: v_readlane_b32 s3, v22, 61
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[34:35]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 58
-; CHECK-NEXT: v_readlane_b32 s1, v22, 59
-; CHECK-NEXT: v_readlane_b32 s2, v22, 60
-; CHECK-NEXT: v_readlane_b32 s3, v22, 61
-; CHECK-NEXT: v_readlane_b32 s4, v22, 62
-; CHECK-NEXT: v_readlane_b32 s5, v22, 63
-; CHECK-NEXT: v_readlane_b32 s6, v23, 0
-; CHECK-NEXT: v_readlane_b32 s7, v23, 1
+; CHECK-NEXT: v_readlane_b32 s0, v22, 62
+; CHECK-NEXT: v_readlane_b32 s1, v22, 63
+; CHECK-NEXT: v_readlane_b32 s2, v23, 0
+; CHECK-NEXT: v_readlane_b32 s3, v23, 1
+; CHECK-NEXT: v_readlane_b32 s4, v23, 2
+; CHECK-NEXT: v_readlane_b32 s5, v23, 3
+; CHECK-NEXT: v_readlane_b32 s6, v23, 4
+; CHECK-NEXT: v_readlane_b32 s7, v23, 5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 2
-; CHECK-NEXT: v_readlane_b32 s1, v23, 3
-; CHECK-NEXT: v_readlane_b32 s2, v23, 4
-; CHECK-NEXT: v_readlane_b32 s3, v23, 5
-; CHECK-NEXT: v_readlane_b32 s4, v23, 6
-; CHECK-NEXT: v_readlane_b32 s5, v23, 7
-; CHECK-NEXT: v_readlane_b32 s6, v23, 8
-; CHECK-NEXT: v_readlane_b32 s7, v23, 9
-; CHECK-NEXT: v_readlane_b32 s8, v23, 10
-; CHECK-NEXT: v_readlane_b32 s9, v23, 11
-; CHECK-NEXT: v_readlane_b32 s10, v23, 12
-; CHECK-NEXT: v_readlane_b32 s11, v23, 13
-; CHECK-NEXT: v_readlane_b32 s12, v23, 14
-; CHECK-NEXT: v_readlane_b32 s13, v23, 15
-; CHECK-NEXT: v_readlane_b32 s14, v23, 16
-; CHECK-NEXT: v_readlane_b32 s15, v23, 17
+; CHECK-NEXT: v_readlane_b32 s0, v23, 6
+; CHECK-NEXT: v_readlane_b32 s1, v23, 7
+; CHECK-NEXT: v_readlane_b32 s2, v23, 8
+; CHECK-NEXT: v_readlane_b32 s3, v23, 9
+; CHECK-NEXT: v_readlane_b32 s4, v23, 10
+; CHECK-NEXT: v_readlane_b32 s5, v23, 11
+; CHECK-NEXT: v_readlane_b32 s6, v23, 12
+; CHECK-NEXT: v_readlane_b32 s7, v23, 13
+; CHECK-NEXT: v_readlane_b32 s8, v23, 14
+; CHECK-NEXT: v_readlane_b32 s9, v23, 15
+; CHECK-NEXT: v_readlane_b32 s10, v23, 16
+; CHECK-NEXT: v_readlane_b32 s11, v23, 17
+; CHECK-NEXT: v_readlane_b32 s12, v23, 18
+; CHECK-NEXT: v_readlane_b32 s13, v23, 19
+; CHECK-NEXT: v_readlane_b32 s14, v23, 20
+; CHECK-NEXT: v_readlane_b32 s15, v23, 21
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 18
-; CHECK-NEXT: v_readlane_b32 s1, v23, 19
+; CHECK-NEXT: v_readlane_b32 s0, v23, 22
+; CHECK-NEXT: v_readlane_b32 s1, v23, 23
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 20
-; CHECK-NEXT: v_readlane_b32 s1, v23, 21
-; CHECK-NEXT: v_readlane_b32 s2, v23, 22
-; CHECK-NEXT: v_readlane_b32 s3, v23, 23
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[0:3]
-; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_readlane_b32 s0, v23, 24
; CHECK-NEXT: v_readlane_b32 s1, v23, 25
; CHECK-NEXT: v_readlane_b32 s2, v23, 26
; CHECK-NEXT: v_readlane_b32 s3, v23, 27
-; CHECK-NEXT: v_readlane_b32 s4, v23, 28
-; CHECK-NEXT: v_readlane_b32 s5, v23, 29
-; CHECK-NEXT: v_readlane_b32 s6, v23, 30
-; CHECK-NEXT: v_readlane_b32 s7, v23, 31
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s0, v23, 28
+; CHECK-NEXT: v_readlane_b32 s1, v23, 29
+; CHECK-NEXT: v_readlane_b32 s2, v23, 30
+; CHECK-NEXT: v_readlane_b32 s3, v23, 31
+; CHECK-NEXT: v_readlane_b32 s4, v23, 32
+; CHECK-NEXT: v_readlane_b32 s5, v23, 33
+; CHECK-NEXT: v_readlane_b32 s6, v23, 34
+; CHECK-NEXT: v_readlane_b32 s7, v23, 35
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 32
-; CHECK-NEXT: v_readlane_b32 s1, v23, 33
-; CHECK-NEXT: v_readlane_b32 s2, v23, 34
-; CHECK-NEXT: v_readlane_b32 s3, v23, 35
-; CHECK-NEXT: v_readlane_b32 s4, v23, 36
-; CHECK-NEXT: v_readlane_b32 s5, v23, 37
-; CHECK-NEXT: v_readlane_b32 s6, v23, 38
-; CHECK-NEXT: v_readlane_b32 s7, v23, 39
-; CHECK-NEXT: v_readlane_b32 s8, v23, 40
-; CHECK-NEXT: v_readlane_b32 s9, v23, 41
-; CHECK-NEXT: v_readlane_b32 s10, v23, 42
-; CHECK-NEXT: v_readlane_b32 s11, v23, 43
-; CHECK-NEXT: v_readlane_b32 s12, v23, 44
-; CHECK-NEXT: v_readlane_b32 s13, v23, 45
-; CHECK-NEXT: v_readlane_b32 s14, v23, 46
-; CHECK-NEXT: v_readlane_b32 s15, v23, 47
+; CHECK-NEXT: v_readlane_b32 s0, v23, 36
+; CHECK-NEXT: v_readlane_b32 s1, v23, 37
+; CHECK-NEXT: v_readlane_b32 s2, v23, 38
+; CHECK-NEXT: v_readlane_b32 s3, v23, 39
+; CHECK-NEXT: v_readlane_b32 s4, v23, 40
+; CHECK-NEXT: v_readlane_b32 s5, v23, 41
+; CHECK-NEXT: v_readlane_b32 s6, v23, 42
+; CHECK-NEXT: v_readlane_b32 s7, v23, 43
+; CHECK-NEXT: v_readlane_b32 s8, v23, 44
+; CHECK-NEXT: v_readlane_b32 s9, v23, 45
+; CHECK-NEXT: v_readlane_b32 s10, v23, 46
+; CHECK-NEXT: v_readlane_b32 s11, v23, 47
+; CHECK-NEXT: v_readlane_b32 s12, v23, 48
+; CHECK-NEXT: v_readlane_b32 s13, v23, 49
+; CHECK-NEXT: v_readlane_b32 s14, v23, 50
+; CHECK-NEXT: v_readlane_b32 s15, v23, 51
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index 455d22f2aa29c..cdfba3cf0db7f 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -7,7 +7,7 @@
define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 {
; GCN-LABEL: partial_no_vgprs_last_sgpr_spill:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 s0, s0, s15
+; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_load_dword s4, s[8:9], 0x2
; GCN-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index a423b6f831a9d..65a17ed67481c 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -182,8 +182,10 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_shl_i128_ss:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s5, s4, 64
; GCN-NEXT: s_sub_i32 s12, 64, s4
@@ -203,6 +205,7 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -215,8 +218,10 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_lshr_i128_ss:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s5, s4, 64
; GCN-NEXT: s_sub_i32 s12, 64, s4
@@ -236,6 +241,7 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -248,8 +254,10 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_ashr_i128_ss:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s5, 64, s4
; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
@@ -270,6 +278,7 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -430,6 +439,9 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_shl_v2i128ss:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v6, 16
; GCN-NEXT: v_mov_b32_e32 v4, 0
@@ -502,6 +514,9 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_lshr_v2i128_ss:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v6, 16
; GCN-NEXT: v_mov_b32_e32 v4, 0
@@ -574,6 +589,9 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_ashr_v2i128_ss:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v6, 16
; GCN-NEXT: v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 8531b2ad4e405..3c47e2504747d 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=AKF_GCN %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
@@ -7,9 +6,6 @@
target datalayout = "A5"
define internal void @indirect() {
-; AKF_GCN-LABEL: define {{[^@]+}}@indirect() {
-; AKF_GCN-NEXT: ret void
-;
; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@indirect
; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] {
; ATTRIBUTOR_GCN-NEXT: ret void
@@ -22,15 +18,6 @@ define internal void @indirect() {
}
define amdgpu_kernel void @test_simple_indirect_call() {
-; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
-; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] {
-; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
-; AKF_GCN-NEXT: [[FPTR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR]] to ptr
-; AKF_GCN-NEXT: store ptr @indirect, ptr [[FPTR_CAST]], align 8
-; AKF_GCN-NEXT: [[FP:%.*]] = load ptr, ptr [[FPTR_CAST]], align 8
-; AKF_GCN-NEXT: call void [[FP]]()
-; AKF_GCN-NEXT: ret void
-;
; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] {
; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
@@ -79,12 +66,10 @@ define amdgpu_kernel void @test_simple_indirect_call() {
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
-; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" }
;.
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
-; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
; ATTRIBUTOR_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index 6b40df0345ebe..46f257eff1f24 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -9,6 +9,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -20,6 +23,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -38,11 +44,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -52,11 +61,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -72,6 +84,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_bitcmp1_b32 s2, 0
; CI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -86,6 +101,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s2, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -104,6 +122,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; CI-LABEL: s_sint_to_fp_i64_to_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3
; CI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -117,6 +138,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; VI-LABEL: s_sint_to_fp_i64_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -136,6 +160,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
@@ -155,6 +182,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -183,6 +213,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i8 s2, s2
; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
@@ -195,6 +228,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s2, s2, 0x80000
; VI-NEXT: s_sext_i32_i16 s2, s2
@@ -232,11 +268,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -246,11 +285,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -283,11 +325,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -297,11 +342,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -353,11 +401,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0, 0xbff00000
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -367,11 +418,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0, 0xbff00000
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index 5ae339454a0ba..9974d78af7ddf 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -12,7 +12,7 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %13.sub0
+ ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %14.sub0
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %23:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %13
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index f791135d45e9a..ef92cf3214e7f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -50,7 +50,10 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 {
define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
; HAWAII-LABEL: local_store_i55:
; HAWAII: ; %bb.0:
+; HAWAII-NEXT: s_add_i32 s12, s12, s17
; HAWAII-NEXT: s_or_b32 s0, s8, 14
+; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s9
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
@@ -70,7 +73,10 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
;
; FIJI-LABEL: local_store_i55:
; FIJI: ; %bb.0:
+; FIJI-NEXT: s_add_i32 s12, s12, s17
; FIJI-NEXT: s_or_b32 s0, s8, 14
+; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s9
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index 19d633651fdd0..30accc846d2b6 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @...............
+; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 9
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index 2097579e0c995..4f84b31f1877b 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000 ................
+; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 5
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 775c62e73261a..644f434923368 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @...............
+; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 9
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index b8f0d7617167e..69cc63eba6243 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -23,11 +23,14 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
; HSA-TRAP-GFX803-LABEL: trap:
; HSA-TRAP-GFX803: ; %bb.0:
; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7]
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s2
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s3
+; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7]
; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX803-NEXT: s_trap 2
@@ -121,6 +124,9 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
; HSA-TRAP-GFX803-LABEL: non_entry_trap:
; HSA-TRAP-GFX803: ; %bb.0: ; %entry
; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1
@@ -280,6 +286,9 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
; HSA-TRAP-GFX803: ; %bb.0:
; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7]
; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5
@@ -411,10 +420,13 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
; HSA-TRAP-GFX803-LABEL: debugtrap:
; HSA-TRAP-GFX803: ; %bb.0:
; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1
; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 6045d423c6bad..b1111876f0280 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -81,6 +81,9 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-LABEL: udiv_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -252,6 +255,9 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; GCN-LABEL: s_udiv_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3
; GCN-NEXT: s_sub_i32 s4, 0, s3
@@ -457,6 +463,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: udiv_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -810,6 +819,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: udiv_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s2, 16
; GCN-NEXT: s_addc_u32 s5, s3, 0
@@ -1135,6 +1147,9 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac
; GCN-LABEL: udiv_i32_div_pow2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1224,6 +1239,9 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: udiv_i32_div_k_even:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1318,6 +1336,9 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; GCN-LABEL: udiv_i32_div_k_odd:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1430,6 +1451,9 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; GCN-LABEL: v_udiv_i8:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1570,6 +1594,9 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: v_udiv_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1726,6 +1753,9 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: v_udiv_i23:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s2, 4
; GCN-NEXT: s_addc_u32 s5, s3, 0
@@ -1923,6 +1953,9 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: v_udiv_i24:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s2, 4
; GCN-NEXT: s_addc_u32 s5, s3, 0
@@ -2105,6 +2138,9 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; GCN-LABEL: scalarize_mulhu_4xi32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
@@ -2218,6 +2254,9 @@ define amdgpu_kernel void @test_udiv2(i32 %p) {
; GCN-LABEL: test_udiv2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshr_b32 s0, s0, 1
; GCN-NEXT: v_mov_b32_e32 v0, s0
@@ -2281,6 +2320,9 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 55cbc14a46706..97738a7944741 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -9,6 +9,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
@@ -28,6 +31,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -54,6 +60,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; SI-LABEL: s_uint_to_fp_i64_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -67,6 +76,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; VI-LABEL: s_uint_to_fp_i64_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -86,6 +98,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1
@@ -103,6 +118,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1
@@ -128,6 +146,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8
; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s1
@@ -160,6 +181,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s7
; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], s5
@@ -196,6 +220,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; SI-NEXT: v_mov_b32_e32 v3, s1
@@ -207,6 +234,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -222,6 +252,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
@@ -239,6 +272,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3
@@ -259,6 +295,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3
@@ -286,11 +325,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -300,11 +342,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -320,6 +365,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s2, 0
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -334,6 +382,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s2, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -353,6 +404,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s2, s2, 0xff
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
@@ -365,6 +419,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xff
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
@@ -402,11 +459,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -416,11 +476,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -453,11 +516,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -467,11 +533,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -505,11 +574,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -519,11 +591,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index 45ea6b62761cc..ab7e85fdff516 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -11,7 +11,7 @@
define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() {
; CHECK-LABEL: __omp_offloading_16_dd2df_main_l9:
; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_add_u32 s0, s0, s15
+; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index b2f299d531f5c..3420707963db2 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -25,8 +25,9 @@
; CHECK-NEXT: argumentInfo:
; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; CHECK-NEXT: workGroupIDX: { reg: '$sgpr8' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
@@ -41,7 +42,7 @@
; CHECK-NEXT: BitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 93f2c343cd051..720631a301192 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -25,8 +25,9 @@
; CHECK-NEXT: argumentInfo:
; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; CHECK-NEXT: workGroupIDX: { reg: '$sgpr8' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
@@ -41,7 +42,7 @@
; CHECK-NEXT: BitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
>From df86e8206d9861abddb68767ab6d1bcab053e1d3 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Thu, 6 Mar 2025 15:03:37 -0800
Subject: [PATCH 2/6] Fix formatting.
---
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index a108300e336ce..85c8b4183166a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -623,14 +623,16 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
DispatchID = true;
}
- const bool IsNoFlatScratchInitSet = F.hasFnAttribute("amdgpu-no-flat-scratch-init");
+ const bool IsNoFlatScratchInitSet =
+ F.hasFnAttribute("amdgpu-no-flat-scratch-init");
if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
(IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
// The line below: If enableFlatScratch() is true, whether
// no-flat-scratch-init is set is not important. If enableFlatScratch()
// is false, FlatScratchInit cannot be true for graphics CC.
- (ST.enableFlatScratch() || (!IsNoFlatScratchInitSet && !AMDGPU::isGraphics(CC))) &&
+ (ST.enableFlatScratch() ||
+ (!IsNoFlatScratchInitSet && !AMDGPU::isGraphics(CC))) &&
!ST.flatScratchIsArchitected()) {
FlatScratchInit = true;
}
>From 34a39fd4d713701e6474a485fc49b2b75ef9d469 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Thu, 13 Mar 2025 16:47:45 -0700
Subject: [PATCH 3/6] Update test files.
Because amdgpu-no-flat-scratch-init is set by opt, the attribute is absent
in most llc tests. This commit updates the affected test files to account for that.
---
.../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 370 +---
.../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 392 +----
.../AMDGPU/GlobalISel/extractelement.ll | 79 +-
...licit-kernarg-backend-usage-global-isel.ll | 30 +-
.../GlobalISel/insertelement-stack-lower.ll | 4 +-
.../AMDGPU/GlobalISel/lds-global-value.ll | 7 +-
.../GlobalISel/llvm.amdgcn.if.break.i64.ll | 7 +-
.../GlobalISel/llvm.amdgcn.trig.preop.ll | 29 +-
.../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 57 +-
.../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 57 +-
.../abi-attribute-hints-undefined-behavior.ll | 42 +-
llvm/test/CodeGen/AMDGPU/always-uniform.ll | 7 +-
...amdgpu-codegenprepare-fold-binop-select.ll | 6 +-
.../CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll | 8 +-
.../AMDGPU/attr-amdgpu-waves-per-eu.ll | 26 +-
.../attributor-flatscratchinit-invalid.ll | 550 ++++++
llvm/test/CodeGen/AMDGPU/attributor-noopt.ll | 6 +-
llvm/test/CodeGen/AMDGPU/code-object-v3.ll | 20 +-
.../CodeGen/AMDGPU/combine-reg-or-const.ll | 5 +-
...dagcomb-extract-vec-elt-different-sizes.ll | 6 +-
.../expand-scalar-carry-out-select-user.ll | 7 +-
.../CodeGen/AMDGPU/extract_vector_elt-i8.ll | 102 +-
llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 81 +-
.../fast-unaligned-load-store.global.ll | 21 +-
llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 250 +--
.../flat-for-global-subtarget-feature.ll | 11 +-
.../AMDGPU/fmul-2-combine-multi-use.ll | 50 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 82 +-
.../CodeGen/AMDGPU/fneg-modifier-casting.ll | 93 +-
llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 64 +-
llvm/test/CodeGen/AMDGPU/half.ll | 233 +--
.../AMDGPU/hsa-metadata-kernel-code-props.ll | 11 +-
llvm/test/CodeGen/AMDGPU/hsa.ll | 6 +-
.../AMDGPU/implicit-kernarg-backend-usage.ll | 46 +-
llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll | 32 +-
.../AMDGPU/insert_vector_elt.v2bf16.ll | 68 +-
.../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 228 +--
.../CodeGen/AMDGPU/invalid-addrspacecast.ll | 7 +-
.../CodeGen/AMDGPU/invalid-cast-load-i1.ll | 5 +-
.../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll | 18 +-
.../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 18 +-
.../AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 39 +-
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 85 +-
.../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 116 +-
.../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 128 +-
llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 10 +-
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 143 +-
llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 85 +-
llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 20 +-
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 166 +-
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 148 +-
llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 107 +-
llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 304 ++--
.../AMDGPU/memory-legalizer-flat-agent.ll | 1566 +---------------
.../memory-legalizer-flat-nontemporal.ll | 86 +-
.../memory-legalizer-flat-singlethread.ll | 1567 +----------------
.../AMDGPU/memory-legalizer-flat-system.ll | 1566 +---------------
.../AMDGPU/memory-legalizer-flat-volatile.ll | 80 +-
.../AMDGPU/memory-legalizer-flat-wavefront.ll | 1549 +---------------
.../AMDGPU/memory-legalizer-flat-workgroup.ll | 1498 +---------------
.../AMDGPU/memory-legalizer-global-agent.ll | 457 +----
.../memory-legalizer-global-nontemporal.ll | 27 +-
.../memory-legalizer-global-singlethread.ll | 462 +----
.../AMDGPU/memory-legalizer-global-system.ll | 437 +----
.../memory-legalizer-global-volatile.ll | 31 +-
.../memory-legalizer-global-wavefront.ll | 462 +----
.../memory-legalizer-global-workgroup.ll | 462 +----
.../memory-legalizer-local-nontemporal.ll | 20 +-
.../AMDGPU/memory-legalizer-local-volatile.ll | 19 +-
.../memory-legalizer-private-nontemporal.ll | 70 +-
.../memory-legalizer-private-volatile.ll | 39 +-
llvm/test/CodeGen/AMDGPU/min.ll | 212 +--
llvm/test/CodeGen/AMDGPU/pack.v2f16.ll | 23 +-
llvm/test/CodeGen/AMDGPU/pack.v2i16.ll | 20 +-
...al-regcopy-and-spill-missed-at-regalloc.ll | 26 +-
.../AMDGPU/preload-implicit-kernargs.ll | 170 +-
llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 379 ++--
llvm/test/CodeGen/AMDGPU/sad.ll | 151 +-
.../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll | 18 +-
.../scc-clobbered-sgpr-to-vmem-spill.ll | 466 +++--
llvm/test/CodeGen/AMDGPU/shift-i128.ll | 61 +-
llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 98 +-
.../CodeGen/AMDGPU/spill-vector-superclass.ll | 4 +-
llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 8 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll | 4 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll | 4 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll | 4 +-
llvm/test/CodeGen/AMDGPU/trap-abis.ll | 25 +-
llvm/test/CodeGen/AMDGPU/udiv.ll | 76 +-
llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 127 +-
...ine-function-info-long-branch-reg-debug.ll | 9 +-
.../machine-function-info-long-branch-reg.ll | 9 +-
92 files changed, 2913 insertions(+), 13871 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid.ll
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index ac24f81136fd6..8654d0f789fac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -20,14 +20,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -38,14 +35,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -103,14 +97,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -121,14 +112,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -299,9 +287,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_dec_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -317,9 +302,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_dec_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -377,9 +359,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_dec_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -397,9 +376,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_dec_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -460,9 +436,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
; CI-LABEL: global_atomic_dec_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -480,9 +453,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
; VI-LABEL: global_atomic_dec_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -543,9 +513,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_dec_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -558,9 +525,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_dec_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -611,9 +575,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_dec_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -628,9 +589,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_dec_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -684,9 +642,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
; CI-LABEL: global_atomic_dec_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -701,9 +656,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
; VI-LABEL: global_atomic_dec_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -758,9 +710,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -768,7 +718,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -783,9 +732,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -793,7 +740,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -856,9 +802,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -876,9 +819,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -938,9 +878,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -956,9 +893,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -974,8 +908,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_ret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -990,10 +922,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_ret_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1030,9 +958,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -1050,9 +975,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -1070,8 +992,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -1086,10 +1006,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1129,9 +1045,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; CI-LABEL: flat_atomic_dec_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -1149,9 +1062,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; VI-LABEL: flat_atomic_dec_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -1169,8 +1079,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -1185,10 +1093,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1228,9 +1132,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1243,9 +1144,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1258,8 +1156,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1271,10 +1167,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1307,9 +1199,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -1324,9 +1213,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -1341,8 +1227,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1354,10 +1238,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1393,9 +1273,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -1410,9 +1287,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -1427,8 +1301,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1440,10 +1312,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1480,9 +1348,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1490,7 +1356,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -1505,9 +1370,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1515,7 +1378,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -1530,8 +1392,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -1550,10 +1410,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 42
@@ -1610,9 +1466,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -1630,9 +1483,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1650,8 +1500,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
@@ -1665,10 +1513,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1715,13 +1559,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1739,13 +1580,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1763,9 +1601,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_ret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -1780,10 +1616,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_ret_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1822,15 +1654,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1848,15 +1677,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1874,9 +1700,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_ret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -1891,10 +1715,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_ret_i64_offset:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1936,13 +1756,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1952,13 +1769,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1968,9 +1782,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -1982,10 +1794,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -2020,15 +1828,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2038,15 +1843,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2056,9 +1858,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -2070,10 +1870,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i64_offset:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -2111,15 +1907,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2129,15 +1922,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2147,9 +1937,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -2161,10 +1949,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -2203,9 +1987,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2232,9 +2013,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2261,14 +2039,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2282,10 +2058,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2344,9 +2116,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -2365,9 +2134,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -2386,14 +2152,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2402,10 +2166,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2459,11 +2219,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_store_dword v[0:1], v3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2480,11 +2237,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2558,10 +2312,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -2577,10 +2328,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -2646,10 +2394,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -2665,10 +2410,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -2852,13 +2594,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_dec_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2871,13 +2610,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_dec_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2935,15 +2671,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_dec_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2956,15 +2689,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_dec_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -3023,15 +2753,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
; CI-LABEL: global_atomic_dec_ret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -3044,15 +2771,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
; VI-LABEL: global_atomic_dec_ret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -3111,13 +2835,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_dec_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -3127,13 +2848,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_dec_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -3184,15 +2902,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_dec_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -3202,15 +2917,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_dec_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -3262,15 +2974,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
; CI-LABEL: global_atomic_dec_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -3280,15 +2989,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
; VI-LABEL: global_atomic_dec_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -3341,9 +3047,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -3367,9 +3070,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -3444,9 +3144,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -3465,9 +3162,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -3538,10 +3232,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v4, s3
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: flat_store_dword v[3:4], v0
@@ -3560,10 +3251,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dword v[3:4], v0
@@ -3631,7 +3319,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
}
attributes #0 = { nounwind speculatable willreturn memory(none) }
-attributes #1 = { nounwind }
+attributes #1 = { nounwind "amdgpu-no-flat-scratch-init"}
attributes #2 = { nounwind memory(none) }
!0 = !{i32 5, i32 6}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index 23c267e7d184e..626ca2690f5fd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -21,14 +21,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -39,14 +36,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -116,14 +110,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -134,14 +125,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -344,9 +332,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_inc_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -362,9 +347,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_inc_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -433,9 +415,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_inc_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -453,9 +432,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_inc_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -527,9 +503,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -547,9 +520,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -622,9 +592,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_inc_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -637,9 +604,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_inc_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -700,9 +664,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_inc_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -717,9 +678,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_inc_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -783,9 +741,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
; CI-LABEL: global_atomic_inc_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -800,9 +755,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
; VI-LABEL: global_atomic_inc_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -868,9 +820,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -878,7 +828,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -893,9 +842,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -903,7 +850,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -979,9 +925,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -999,9 +942,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1079,11 +1019,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_store_dword v[0:1], v3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1100,11 +1037,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1195,10 +1129,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -1214,10 +1145,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1296,10 +1224,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -1315,10 +1240,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1537,13 +1459,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_inc_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1556,13 +1475,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_inc_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1632,15 +1548,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_inc_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1653,15 +1566,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_inc_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1732,15 +1642,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
; CI-LABEL: global_atomic_inc_ret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1753,15 +1660,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
; VI-LABEL: global_atomic_inc_ret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1833,13 +1737,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_inc_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1849,13 +1750,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_inc_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1917,15 +1815,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_inc_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1935,15 +1830,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_inc_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2006,15 +1898,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
; CI-LABEL: global_atomic_inc_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2024,15 +1913,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
; VI-LABEL: global_atomic_inc_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2097,9 +1983,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2123,9 +2006,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2214,9 +2094,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -2235,9 +2112,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -2314,9 +2188,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -2332,9 +2203,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -2350,8 +2218,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_ret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2366,10 +2232,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_ret_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2419,9 +2281,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -2439,9 +2298,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -2459,8 +2315,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_ret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2475,10 +2329,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_ret_i32_offset:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2531,9 +2381,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; CI-LABEL: flat_atomic_inc_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -2551,9 +2398,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; VI-LABEL: flat_atomic_inc_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -2571,8 +2415,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2587,10 +2429,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2644,9 +2482,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2659,9 +2494,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2674,8 +2506,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -2687,10 +2517,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2734,9 +2560,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -2751,9 +2574,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -2768,8 +2588,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -2781,10 +2599,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2831,9 +2645,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -2848,9 +2659,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -2865,8 +2673,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -2878,10 +2684,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2930,9 +2732,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2940,7 +2740,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -2955,9 +2754,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2965,7 +2762,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -2980,8 +2776,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -3000,10 +2794,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 42
@@ -3081,9 +2871,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -3101,9 +2888,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -3121,8 +2905,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
@@ -3136,10 +2918,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -3210,10 +2988,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v4, s3
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: flat_store_dword v[3:4], v0
@@ -3232,10 +3007,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dword v[3:4], v0
@@ -3325,13 +3097,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3349,13 +3118,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3373,9 +3139,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_ret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -3390,10 +3154,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_ret_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3446,15 +3206,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3472,15 +3229,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3498,9 +3252,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_ret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -3515,10 +3267,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_ret_i64_offset:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3574,15 +3322,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; CI-LABEL: flat_atomic_inc_ret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3600,15 +3345,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; VI-LABEL: flat_atomic_inc_ret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3626,9 +3368,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -3643,10 +3383,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3703,13 +3439,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3719,13 +3452,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3735,9 +3465,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -3749,10 +3477,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3799,15 +3523,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3817,15 +3538,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3835,9 +3553,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -3849,10 +3565,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3902,15 +3614,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3920,15 +3629,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3938,9 +3644,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -3952,10 +3656,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -4007,9 +3707,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -4036,9 +3733,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -4065,14 +3759,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4086,10 +3778,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -4170,9 +3858,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -4191,9 +3876,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -4212,14 +3894,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4228,10 +3908,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -4299,7 +3975,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s4
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0
@@ -4307,8 +3982,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -4322,7 +3995,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0
@@ -4330,8 +4002,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -4419,7 +4089,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
}
attributes #0 = { nounwind speculatable willreturn memory(none) }
-attributes #1 = { nounwind }
+attributes #1 = { nounwind "amdgpu-no-flat-scratch-init" }
attributes #2 = { nounwind memory(none) }
!0 = !{i32 5, i32 6}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 9ef16aef0dd16..1e3163e584ce1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3004,7 +3004,7 @@ entry:
ret double %ext
}
-define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel) {
+define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel) #0 {
; GPRIDX-LABEL: dyn_extract_v5f64_s_s:
; GPRIDX: .amd_kernel_code_t
; GPRIDX-NEXT: amd_code_version_major = 1
@@ -3016,7 +3016,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -3027,7 +3027,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT: user_sgpr_count = 14
+; GPRIDX-NEXT: user_sgpr_count = 12
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -3042,7 +3042,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1
; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1
+; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 17
+; GPRIDX-NEXT: wavefront_sgpr_count = 15
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -3107,7 +3107,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -3118,7 +3118,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT: user_sgpr_count = 14
+; MOVREL-NEXT: user_sgpr_count = 12
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -3133,7 +3133,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_sgpr_queue_ptr = 1
; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; MOVREL-NEXT: enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1
+; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0
; MOVREL-NEXT: enable_sgpr_private_segment_size = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -3150,7 +3150,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 24
+; MOVREL-NEXT: wavefront_sgpr_count = 10
; MOVREL-NEXT: workitem_vgpr_count = 4
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
@@ -3168,24 +3168,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; MOVREL-NEXT: s_load_dword s8, s[8:9], 0x8
-; MOVREL-NEXT: s_add_i32 s12, s12, s17
-; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; MOVREL-NEXT: s_mov_b32 s4, 0
; MOVREL-NEXT: s_mov_b32 s5, 0x40080000
+; MOVREL-NEXT: s_mov_b32 s2, 0
+; MOVREL-NEXT: s_mov_b32 s3, 0x40140000
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s8, 1
; MOVREL-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
; MOVREL-NEXT: s_cmp_eq_u32 s8, 2
-; MOVREL-NEXT: s_mov_b32 s2, 0
; MOVREL-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; MOVREL-NEXT: s_cmp_eq_u32 s8, 3
-; MOVREL-NEXT: s_mov_b32 s3, 0x40140000
; MOVREL-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5]
; MOVREL-NEXT: s_cmp_eq_u32 s8, 4
; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; MOVREL-NEXT: v_mov_b32_e32 v0, s2
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
-; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13
; MOVREL-NEXT: v_mov_b32_e32 v1, s3
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -3213,7 +3210,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT: user_sgpr_count = 14
+; GFX10-NEXT: user_sgpr_count = 12
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -3228,7 +3225,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_sgpr_queue_ptr = 1
; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GFX10-NEXT: enable_sgpr_dispatch_id = 1
-; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1
+; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0
; GFX10-NEXT: enable_sgpr_private_segment_size = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4022,7 +4019,7 @@ entry:
ret float %ext
}
-define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %sel) {
+define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %sel) #0 {
; GPRIDX-LABEL: dyn_extract_v4f32_s_s_s:
; GPRIDX: .amd_kernel_code_t
; GPRIDX-NEXT: amd_code_version_major = 1
@@ -4045,7 +4042,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT: user_sgpr_count = 14
+; GPRIDX-NEXT: user_sgpr_count = 12
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4060,7 +4057,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1
; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1
+; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4077,7 +4074,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 16
+; GPRIDX-NEXT: wavefront_sgpr_count = 14
; GPRIDX-NEXT: workitem_vgpr_count = 2
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -4118,7 +4115,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4129,7 +4126,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT: user_sgpr_count = 14
+; MOVREL-NEXT: user_sgpr_count = 12
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4144,7 +4141,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_sgpr_queue_ptr = 1
; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; MOVREL-NEXT: enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1
+; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0
; MOVREL-NEXT: enable_sgpr_private_segment_size = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4161,7 +4158,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 24
+; MOVREL-NEXT: wavefront_sgpr_count = 10
; MOVREL-NEXT: workitem_vgpr_count = 3
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
@@ -4179,9 +4176,6 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dword s2, s[8:9], 0x8
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; MOVREL-NEXT: s_add_i32 s12, s12, s17
-; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s2, 1
; MOVREL-NEXT: s_cselect_b32 s3, 2.0, 1.0
@@ -4217,7 +4211,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT: user_sgpr_count = 14
+; GFX10-NEXT: user_sgpr_count = 12
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4232,7 +4226,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_sgpr_queue_ptr = 1
; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GFX10-NEXT: enable_sgpr_dispatch_id = 1
-; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1
+; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0
; GFX10-NEXT: enable_sgpr_private_segment_size = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4370,7 +4364,7 @@ entry:
ret void
}
-define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %sel) {
+define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %sel) #0 {
; GPRIDX-LABEL: dyn_extract_v4f64_s_s_s:
; GPRIDX: .amd_kernel_code_t
; GPRIDX-NEXT: amd_code_version_major = 1
@@ -4393,7 +4387,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT: user_sgpr_count = 14
+; GPRIDX-NEXT: user_sgpr_count = 12
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4408,7 +4402,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1
; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1
+; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4425,7 +4419,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 16
+; GPRIDX-NEXT: wavefront_sgpr_count = 14
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -4469,7 +4463,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4480,7 +4474,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT: user_sgpr_count = 14
+; MOVREL-NEXT: user_sgpr_count = 12
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4495,7 +4489,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_sgpr_queue_ptr = 1
; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; MOVREL-NEXT: enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1
+; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0
; MOVREL-NEXT: enable_sgpr_private_segment_size = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4512,7 +4506,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 24
+; MOVREL-NEXT: wavefront_sgpr_count = 10
; MOVREL-NEXT: workitem_vgpr_count = 4
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
@@ -4530,12 +4524,10 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dword s6, s[8:9], 0x8
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; MOVREL-NEXT: s_add_i32 s12, s12, s17
-; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; MOVREL-NEXT: s_mov_b32 s2, 0
+; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s6, 1
-; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
; MOVREL-NEXT: s_cmp_eq_u32 s6, 2
; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
@@ -4543,7 +4535,6 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
; MOVREL-NEXT: v_mov_b32_e32 v0, s2
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
-; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13
; MOVREL-NEXT: v_mov_b32_e32 v1, s3
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -4571,7 +4562,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT: user_sgpr_count = 14
+; GFX10-NEXT: user_sgpr_count = 12
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4586,7 +4577,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_sgpr_queue_ptr = 1
; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GFX10-NEXT: enable_sgpr_dispatch_id = 1
-; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1
+; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0
; GFX10-NEXT: enable_sgpr_private_segment_size = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4868,3 +4859,5 @@ define i32 @v_extract_v64i32_37(ptr addrspace(1) %ptr) {
%elt = extractelement <64 x i32> %vec, i32 37
ret i32 %elt
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 64cdf577a3db9..86766e2904619 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -121,15 +121,12 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
ret void
}
-define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
+define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
; GFX8V4-LABEL: llvm_amdgcn_is_shared:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40
-; GFX8V4-NEXT: s_add_i32 s12, s12, s17
-; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0
@@ -143,9 +140,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc
-; GFX8V5-NEXT: s_add_i32 s12, s12, s17
-; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0
@@ -183,15 +177,12 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
ret void
}
-define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
+define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
; GFX8V4-LABEL: llvm_amdgcn_is_private:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44
-; GFX8V4-NEXT: s_add_i32 s12, s12, s17
-; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0
@@ -205,9 +196,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8
-; GFX8V5-NEXT: s_add_i32 s12, s12, s17
-; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0
@@ -245,7 +233,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
ret void
}
-define amdgpu_kernel void @llvm_trap() {
+define amdgpu_kernel void @llvm_trap() #0 {
; GFX8V4-LABEL: llvm_trap:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_mov_b64 s[0:1], s[6:7]
@@ -268,7 +256,7 @@ define amdgpu_kernel void @llvm_trap() {
unreachable
}
-define amdgpu_kernel void @llvm_debugtrap() {
+define amdgpu_kernel void @llvm_debugtrap() #0 {
; GFX8V4-LABEL: llvm_debugtrap:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_trap 3
@@ -288,13 +276,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
unreachable
}
-define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
+define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V4: ; %bb.0:
-; GFX8V4-NEXT: s_add_i32 s12, s12, s17
; GFX8V4-NEXT: v_mov_b32_e32 v0, s6
-; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8V4-NEXT: v_mov_b32_e32 v1, s7
; GFX8V4-NEXT: s_add_u32 s0, s8, 8
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -320,10 +305,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V5: ; %bb.0:
-; GFX8V5-NEXT: s_add_i32 s12, s12, s17
; GFX8V5-NEXT: v_mov_b32_e32 v0, s6
-; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8V5-NEXT: v_mov_b32_e32 v1, s7
; GFX8V5-NEXT: s_add_u32 s0, s8, 8
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -402,3 +384,5 @@ declare void @llvm.debugtrap()
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index 94853767ccfac..b025ec3e1da4d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[20:23], s[8:9], 0x0
; GCN-NEXT: s_load_dwordx2 s[24:25], s[8:9], 0x10
-; GCN-NEXT: s_add_u32 s0, s0, s17
+; GCN-NEXT: s_add_u32 s0, s0, s15
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: v_mov_b32_e32 v64, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -256,4 +256,4 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
ret void
}
-attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,10" }
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,10" "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
index 859f7ef16e395..11afcda8fbc53 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -11,16 +11,13 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace(
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 4
; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: s_add_i32 s12, s12, s17
; CHECK-NEXT: ds_read_b32 v2, v0
-; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-NEXT: v_mov_b32_e32 v3, 9
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_add_u32 s0, s0, 4
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: v_mov_b32_e32 v3, 9
; CHECK-NEXT: flat_store_dword v[0:1], v2
; CHECK-NEXT: v_mov_b32_e32 v0, 0x200
; CHECK-NEXT: ds_write_b32 v0, v3
@@ -34,4 +31,4 @@ entry:
ret void
}
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
index a5a75f74833f1..e719270c8620d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
@@ -1,14 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) {
+define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) #0 {
; GCN-LABEL: test_wave64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s2, s[8:9], 0x0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0xa
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 1, 0
@@ -28,3 +25,5 @@ entry:
}
declare i64 @llvm.amdgcn.if.break.i64(i1, i64)
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
index 1deee215e522b..35fb563844865 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
@@ -37,14 +37,11 @@ define double @v_trig_preop_f64_imm(double %a, i32 %b) {
ret double %result
}
-define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
+define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) #1 {
; CI-LABEL: s_trig_preop_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -62,9 +59,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -82,8 +76,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -93,10 +85,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
;
; GFX10-LABEL: s_trig_preop_f64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
@@ -121,13 +109,10 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
ret void
}
-define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
+define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) #1 {
; CI-LABEL: s_trig_preop_f64_imm:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; CI-NEXT: s_add_u32 s0, s0, 4
@@ -143,9 +128,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
; VI-LABEL: s_trig_preop_f64_imm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; VI-NEXT: s_add_u32 s0, s0, 4
@@ -161,8 +143,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
; GFX9-LABEL: s_trig_preop_f64_imm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
@@ -171,10 +151,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
;
; GFX10-LABEL: s_trig_preop_f64_imm:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
@@ -198,3 +174,4 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
declare double @llvm.amdgcn.trig.preop.f64(double, i32) #0
attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index b59f85b2dfa38..0817f325a7fe5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -3,13 +3,10 @@
; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) {
+define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) #0 {
; GFX8-LABEL: sdivrem_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s6, s5, 31
; GFX8-NEXT: s_add_i32 s0, s5, s6
@@ -145,13 +142,10 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
+define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) #0 {
; GFX8-LABEL: sdivrem_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s2, s9, 31
; GFX8-NEXT: s_ashr_i32 s12, s11, 31
@@ -619,13 +613,10 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) #0 {
; GFX8-LABEL: sdivrem_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s2, s10, 31
; GFX8-NEXT: s_add_i32 s0, s10, s2
@@ -851,12 +842,9 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) #0 {
; GFX8-LABEL: sdivrem_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1280,12 +1268,9 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) #0 {
; GFX8-LABEL: sdivrem_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2198,13 +2183,10 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) {
+define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) #0 {
; GFX8-LABEL: sdiv_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -2346,13 +2328,10 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) {
+define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) #0 {
; GFX8-LABEL: sdivrem_v2i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x10
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010
; GFX8-NEXT: s_ashr_i32 s3, s0, 31
@@ -2613,13 +2592,10 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) {
+define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) #0 {
; GFX8-LABEL: sdiv_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -2761,13 +2737,10 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
ret void
}
-define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) {
+define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) #0 {
; GFX8-LABEL: sdivrem_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x10
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sext_i32_i16 s0, s3
; GFX8-NEXT: s_ashr_i32 s10, s0, 31
@@ -3025,13 +2998,10 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) {
+define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) #0 {
; GFX8-LABEL: sdivrem_i3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -3179,13 +3149,10 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
ret void
}
-define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) {
+define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) #0 {
; GFX8-LABEL: sdivrem_i27:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -3332,3 +3299,5 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
store i27 %rem, ptr addrspace(1) %out1
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index ff0114cfc3ddb..4e8d82003ddb3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -3,13 +3,10 @@
; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) {
+define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) #0 {
; GFX8-LABEL: udivrem_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
; GFX8-NEXT: s_sub_i32 s0, 0, s5
@@ -112,13 +109,10 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
+define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) #0 {
; GFX8-LABEL: udivrem_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10
@@ -525,13 +519,10 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) #0 {
; GFX8-LABEL: udivrem_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11
@@ -691,12 +682,9 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) #0 {
; GFX8-LABEL: udivrem_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -989,12 +977,9 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) #0 {
; GFX8-LABEL: udivrem_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x20
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1783,13 +1768,10 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) {
+define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) #0 {
; GFX8-LABEL: udiv_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
@@ -1898,14 +1880,11 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) {
+define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) #0 {
; GFX8-LABEL: udivrem_v2i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s0, s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
@@ -2098,13 +2077,10 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) {
+define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) #0 {
; GFX8-LABEL: udiv_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s5, s4, 16
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
@@ -2213,14 +2189,11 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
ret void
}
-define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) {
+define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) #0 {
; GFX8-LABEL: udivrem_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s2, s1, 0xffff
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
@@ -2410,13 +2383,10 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) {
+define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) #0 {
; GFX8-LABEL: udivrem_i3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
@@ -2531,13 +2501,10 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
ret void
}
-define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) {
+define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) #0 {
; GFX8-LABEL: udivrem_i27:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
@@ -2651,3 +2618,5 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
store i27 %rem, ptr addrspace(1) %out1
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 7a7863462357b..653e17f75df11 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -93,7 +93,7 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
}
; Function is marked with amdgpu-no-workitem-id-* but uses them anyway
-define void @marked_func_use_workitem_id(ptr addrspace(1) %ptr) #0 {
+define void @marked_func_use_workitem_id(ptr addrspace(1) %ptr) #1 {
; FIXEDABI-SDAG-LABEL: marked_func_use_workitem_id:
; FIXEDABI-SDAG: ; %bb.0:
; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -131,13 +131,10 @@ define void @marked_func_use_workitem_id(ptr addrspace(1) %ptr) #0 {
}
; Function is marked with amdgpu-no-workitem-id-* but uses them anyway
-define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr) #0 {
+define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr) #1 {
; FIXEDABI-LABEL: marked_kernel_use_workitem_id:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; FIXEDABI-NEXT: s_add_i32 s6, s6, s11
-; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7
-; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0)
; FIXEDABI-NEXT: v_mov_b32_e32 v4, s1
; FIXEDABI-NEXT: v_mov_b32_e32 v3, s0
@@ -157,7 +154,7 @@ define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr)
ret void
}
-define void @marked_func_use_workgroup_id(ptr addrspace(1) %ptr) #0 {
+define void @marked_func_use_workgroup_id(ptr addrspace(1) %ptr) #1 {
; FIXEDABI-LABEL: marked_func_use_workgroup_id:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -180,23 +177,20 @@ define void @marked_func_use_workgroup_id(ptr addrspace(1) %ptr) #0 {
ret void
}
-define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr) #0 {
+define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr) #1 {
; FIXEDABI-LABEL: marked_kernel_use_workgroup_id:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; FIXEDABI-NEXT: s_add_i32 s6, s6, s11
-; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7
-; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8
+; FIXEDABI-NEXT: v_mov_b32_e32 v2, s6
; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0)
; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0
; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1
; FIXEDABI-NEXT: flat_store_dword v[0:1], v2
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s9
+; FIXEDABI-NEXT: v_mov_b32_e32 v2, s7
; FIXEDABI-NEXT: flat_store_dword v[0:1], v2
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s10
+; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8
; FIXEDABI-NEXT: flat_store_dword v[0:1], v2
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
; FIXEDABI-NEXT: s_endpgm
@@ -209,7 +203,7 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr)
ret void
}
-define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
+define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #1 {
; FIXEDABI-LABEL: marked_func_use_other_sgpr:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -241,12 +235,9 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
ret void
}
-define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
+define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #1 {
; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
; FIXEDABI: ; %bb.0:
-; FIXEDABI-NEXT: s_add_i32 s6, s6, s11
-; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7
-; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; FIXEDABI-NEXT: s_add_u32 s0, s4, 8
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0
@@ -267,13 +258,10 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
ret void
}
-define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 {
+define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #1 {
; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr:
; FIXEDABI: ; %bb.0:
-; FIXEDABI-NEXT: s_add_i32 s4, s4, s9
; FIXEDABI-NEXT: v_mov_b32_e32 v0, 0
-; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s5
-; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-NEXT: v_mov_b32_e32 v1, 0
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
; FIXEDABI-NEXT: s_endpgm
@@ -338,7 +326,7 @@ define void @addrspacecast_requires_queue_ptr(ptr addrspace(5) %ptr.private, ptr
ret void
}
-define void @is_shared_requires_queue_ptr(ptr %ptr) #0 {
+define void @is_shared_requires_queue_ptr(ptr %ptr) #1 {
; FIXEDABI-LABEL: is_shared_requires_queue_ptr:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -356,7 +344,7 @@ define void @is_shared_requires_queue_ptr(ptr %ptr) #0 {
ret void
}
-define void @is_private_requires_queue_ptr(ptr %ptr) #0 {
+define void @is_private_requires_queue_ptr(ptr %ptr) #1 {
; FIXEDABI-LABEL: is_private_requires_queue_ptr:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -374,7 +362,7 @@ define void @is_private_requires_queue_ptr(ptr %ptr) #0 {
ret void
}
-define void @trap_requires_queue() #0 {
+define void @trap_requires_queue() #1 {
; FIXEDABI-LABEL: trap_requires_queue:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -386,7 +374,7 @@ define void @trap_requires_queue() #0 {
unreachable
}
-define void @debugtrap_requires_queue() #0 {
+define void @debugtrap_requires_queue() #1 {
; FIXEDABI-LABEL: debugtrap_requires_queue:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -412,5 +400,7 @@ declare void @llvm.debugtrap()
attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-work-group-id-x" "amdgpu-no-work-group-id-y" "amdgpu-no-work-group-id-z" "amdgpu-no-work-item-id-x" "amdgpu-no-work-item-id-y" "amdgpu-no-work-item-id-z" }
+attributes #1 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-work-group-id-x" "amdgpu-no-work-group-id-y" "amdgpu-no-work-group-id-z" "amdgpu-no-work-item-id-x" "amdgpu-no-work-item-id-y" "amdgpu-no-work-item-id-z" }
+
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
index 4e7022710c671..b88a616b80ef2 100644
--- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
@@ -4,14 +4,12 @@
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.readfirstlane(i32)
-define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapture readonly, ptr addrspace(1) noalias nocapture readonly) {
+define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapture readonly, ptr addrspace(1) noalias nocapture readonly) #0 {
; GCN-LABEL: readfirstlane_uniform:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_mov_b32 s5, 0
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s0, s0, s4
@@ -20,7 +18,6 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
; GCN-NEXT: s_add_u32 s0, s2, 40
; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
@@ -35,3 +32,5 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
store float %val, ptr addrspace(1) %gep1, align 4
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index e71bf15384727..755449b3bce7b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -384,7 +384,7 @@ define i32 @select_mul_rhs_const_i32(i1 %cond) {
ret i32 %op
}
-define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
+define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) #1 {
; IR-LABEL: @select_add_lhs_const_i16(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 128, i16 131
; IR-NEXT: store i16 [[OP]], ptr addrspace(1) poison, align 2
@@ -393,9 +393,6 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
; GCN-LABEL: select_add_lhs_const_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s0, 0
; GCN-NEXT: s_movk_i32 s0, 0x80
@@ -513,3 +510,4 @@ define <2 x half> @multi_use_cast_regression(i1 %cond) {
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #0
attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
index a4fe7121e347d..85b5c7c870b23 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
@@ -2,8 +2,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE
; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
-; TRAP-HANDLER-ENABLE: NumSgprs: 67
-; TRAP-HANDLER-DISABLE: NumSgprs: 83
+; TRAP-HANDLER-ENABLE: NumSgprs: 61
+; TRAP-HANDLER-DISABLE: NumSgprs: 77
define amdgpu_kernel void @amdhsa_trap_num_sgprs(
ptr addrspace(1) %out0, i32 %in0,
ptr addrspace(1) %out1, i32 %in1,
@@ -34,7 +34,7 @@ define amdgpu_kernel void @amdhsa_trap_num_sgprs(
ptr addrspace(1) %out26, i32 %in26,
ptr addrspace(1) %out27, i32 %in27,
ptr addrspace(1) %out28, i32 %in28,
- ptr addrspace(1) %out29, i32 %in29) {
+ ptr addrspace(1) %out29, i32 %in29) #0 {
entry:
store i32 %in0, ptr addrspace(1) %out0
store i32 %in1, ptr addrspace(1) %out1
@@ -68,3 +68,5 @@ entry:
store i32 %in29, ptr addrspace(1) %out29
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 4507fd5865989..aec619b837a1f 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -10,7 +10,7 @@ define amdgpu_kernel void @empty_exactly_1() #0 {
entry:
ret void
}
-attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,64" }
+attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-flat-scratch-init"}
; Exactly 5 waves per execution unit.
; CHECK-LABEL: {{^}}empty_exactly_5:
@@ -22,7 +22,7 @@ define amdgpu_kernel void @empty_exactly_5() #1 {
entry:
ret void
}
-attributes #1 = {"amdgpu-waves-per-eu"="5,5"}
+attributes #1 = {"amdgpu-waves-per-eu"="5,5" "amdgpu-no-flat-scratch-init"}
; Exactly 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_exactly_10:
@@ -34,7 +34,7 @@ define amdgpu_kernel void @empty_exactly_10() #2 {
entry:
ret void
}
-attributes #2 = {"amdgpu-waves-per-eu"="10,10"}
+attributes #2 = {"amdgpu-waves-per-eu"="10,10" "amdgpu-no-flat-scratch-init"}
; At least 1 wave per execution unit.
; CHECK-LABEL: {{^}}empty_at_least_1:
@@ -46,7 +46,7 @@ define amdgpu_kernel void @empty_at_least_1() #3 {
entry:
ret void
}
-attributes #3 = {"amdgpu-waves-per-eu"="1"}
+attributes #3 = {"amdgpu-waves-per-eu"="1" "amdgpu-no-flat-scratch-init"}
; At least 5 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_least_5:
@@ -58,7 +58,7 @@ define amdgpu_kernel void @empty_at_least_5() #4 {
entry:
ret void
}
-attributes #4 = {"amdgpu-waves-per-eu"="5"}
+attributes #4 = {"amdgpu-waves-per-eu"="5" "amdgpu-no-flat-scratch-init"}
; At least 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_least_10:
@@ -70,7 +70,7 @@ define amdgpu_kernel void @empty_at_least_10() #5 {
entry:
ret void
}
-attributes #5 = {"amdgpu-waves-per-eu"="10"}
+attributes #5 = {"amdgpu-waves-per-eu"="10" "amdgpu-no-flat-scratch-init"}
; At most 1 wave per execution unit (same as @empty_exactly_1).
@@ -84,7 +84,7 @@ define amdgpu_kernel void @empty_at_most_5() #6 {
entry:
ret void
}
-attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64"}
+attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-flat-scratch-init"}
; At most 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_most_10:
@@ -96,7 +96,7 @@ define amdgpu_kernel void @empty_at_most_10() #7 {
entry:
ret void
}
-attributes #7 = {"amdgpu-waves-per-eu"="1,10"}
+attributes #7 = {"amdgpu-waves-per-eu"="1,10" "amdgpu-no-flat-scratch-init"}
; Between 1 and 5 waves per execution unit (same as @empty_at_most_5).
@@ -110,15 +110,15 @@ define amdgpu_kernel void @empty_between_5_and_10() #8 {
entry:
ret void
}
-attributes #8 = {"amdgpu-waves-per-eu"="5,10"}
+attributes #8 = {"amdgpu-waves-per-eu"="5,10" "amdgpu-no-flat-scratch-init"}
@var = addrspace(1) global float 0.0
; Exactly 10 waves per execution unit.
; CHECK-LABEL: {{^}}exactly_10:
-; CHECK: SGPRBlocks: 3
+; CHECK: SGPRBlocks: 2
; CHECK: VGPRBlocks: 5
-; CHECK: NumSGPRsForWavesPerEU: 30
+; CHECK: NumSGPRsForWavesPerEU: 20
; CHECK: NumVGPRsForWavesPerEU: 24
define amdgpu_kernel void @exactly_10() #9 {
%val0 = load volatile float, ptr addrspace(1) @var
@@ -187,7 +187,7 @@ define amdgpu_kernel void @exactly_10() #9 {
ret void
}
-attributes #9 = {"amdgpu-waves-per-eu"="10,10"}
+attributes #9 = {"amdgpu-waves-per-eu"="10,10" "amdgpu-no-flat-scratch-init"}
; Exactly 256 workitems and exactly 2 waves.
; CHECK-LABEL: {{^}}empty_workitems_exactly_256_waves_exactly_2:
@@ -199,4 +199,4 @@ define amdgpu_kernel void @empty_workitems_exactly_256_waves_exactly_2() #10 {
entry:
ret void
}
-attributes #10 = {"amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="2,2"}
+attributes #10 = {"amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="2,2" "amdgpu-no-flat-scratch-init"}
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid.ll
new file mode 100644
index 0000000000000..8b816dfc28728
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid.ll
@@ -0,0 +1,550 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+
+;
+; These functions all should have the attribute amdgpu-no-flat-scratch-init set if the AMDGPUAttributor
+; pass is run. Therefore the purpose is to test llc when the attribute is incorrectly missing.
+;
+;; tests of addrspacecast
+
+define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: without_private_to_flat_addrspacecast:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: without_private_to_flat_addrspacecast:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store volatile i32 0, ptr addrspace(5) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: without_private_to_flat_addrspacecast_cc_kernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GFX9-NEXT: s_mov_b64 s[20:21], s[0:1]
+; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 s20, s20, s17
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: buffer_store_dword v0, v1, s[20:23], 0 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: without_private_to_flat_addrspacecast_cc_kernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GFX10-NEXT: s_mov_b64 s[20:21], s[0:1]
+; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_add_u32 s20, s20, s17
+; GFX10-NEXT: s_addc_u32 s21, s21, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: buffer_store_dword v0, v1, s[20:23], 0 offen
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+ store volatile i32 0, ptr addrspace(5) %ptr
+ ret void
+}
+
+define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: call_without_private_to_flat_addrspacecast:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s18, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[16:17]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[16:17]
+; GFX9-NEXT: s_add_u32 s16, s16, without_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s17, s17, without_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b32 s33, s18
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: call_without_private_to_flat_addrspacecast:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s18, s33
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
+; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s16
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_getpc_b64 s[16:17]
+; GFX10-NEXT: s_add_u32 s16, s16, without_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s17, s17, without_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v2, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX10-NEXT: v_writelane_b32 v2, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s31, v2, 1
+; GFX10-NEXT: v_readlane_b32 s30, v2, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_mov_b32 s33, s18
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+ ret void
+}
+
+define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: call_without_private_to_flat_addrspacecast_cc_kernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT: s_add_u32 s0, s0, s17
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_load_dword s17, s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 s8, s8, 8
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: s_mov_b32 s13, s15
+; GFX9-NEXT: s_mov_b32 s12, s14
+; GFX9-NEXT: s_getpc_b64 s[14:15]
+; GFX9-NEXT: s_add_u32 s14, s14, without_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s15, s15, without_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s14, s16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s17
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: call_without_private_to_flat_addrspacecast_cc_kernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_mov_b32 s32, 0
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-NEXT: s_add_u32 s0, s0, s17
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_load_dword s17, s[8:9], 0x0
+; GFX10-NEXT: s_add_u32 s8, s8, 8
+; GFX10-NEXT: s_addc_u32 s9, s9, 0
+; GFX10-NEXT: s_mov_b32 s13, s15
+; GFX10-NEXT: s_mov_b32 s12, s14
+; GFX10-NEXT: s_getpc_b64 s[14:15]
+; GFX10-NEXT: s_add_u32 s14, s14, without_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s15, s15, without_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX10-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT: s_mov_b32 s14, s16
+; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s17
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX10-NEXT: s_endpgm
+ call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+ ret void
+}
+
+define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: call_call_without_private_to_flat_addrspacecast:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s19, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[16:17]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[16:17]
+; GFX9-NEXT: s_add_u32 s16, s16, call_without_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s17, s17, call_without_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: v_writelane_b32 v3, s30, 0
+; GFX9-NEXT: v_writelane_b32 v3, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: v_readlane_b32 s31, v3, 1
+; GFX9-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b32 s33, s19
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: call_call_without_private_to_flat_addrspacecast:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s19, s33
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
+; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s16
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_getpc_b64 s[16:17]
+; GFX10-NEXT: s_add_u32 s16, s16, call_without_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s17, s17, call_without_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v3, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX10-NEXT: v_writelane_b32 v3, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s31, v3, 1
+; GFX10-NEXT: v_readlane_b32 s30, v3, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_mov_b32 s33, s19
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+ ret void
+}
+
+define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: call_call_without_private_to_flat_addrspacecast_cc_kernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT: s_add_u32 s0, s0, s17
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_load_dword s17, s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 s8, s8, 8
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: s_mov_b32 s13, s15
+; GFX9-NEXT: s_mov_b32 s12, s14
+; GFX9-NEXT: s_getpc_b64 s[14:15]
+; GFX9-NEXT: s_add_u32 s14, s14, call_without_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s15, s15, call_without_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s14, s16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s17
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: call_call_without_private_to_flat_addrspacecast_cc_kernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_mov_b32 s32, 0
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-NEXT: s_add_u32 s0, s0, s17
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_load_dword s17, s[8:9], 0x0
+; GFX10-NEXT: s_add_u32 s8, s8, 8
+; GFX10-NEXT: s_addc_u32 s9, s9, 0
+; GFX10-NEXT: s_mov_b32 s13, s15
+; GFX10-NEXT: s_mov_b32 s12, s14
+; GFX10-NEXT: s_getpc_b64 s[14:15]
+; GFX10-NEXT: s_add_u32 s14, s14, call_without_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s15, s15, call_without_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX10-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT: s_mov_b32 s14, s16
+; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s17
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX10-NEXT: s_endpgm
+ call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+ ret void
+}
+
+;; tests of indirect call, intrinsics, inline asm
+
+@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
+
+define void @empty() {
+; GFX9-LABEL: empty:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: empty:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ ret void
+}
+
+define void @also_empty() {
+; GFX9-LABEL: also_empty:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: also_empty:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ ret void
+}
+
+define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) {
+; GFX9-LABEL: indirect_call_known_callees:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT: s_add_u32 s0, s0, s17
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_mov_b32 s13, s15
+; GFX9-NEXT: s_mov_b32 s12, s14
+; GFX9-NEXT: s_load_dword s17, s[8:9], 0x0
+; GFX9-NEXT: s_getpc_b64 s[14:15]
+; GFX9-NEXT: s_add_u32 s14, s14, empty@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s15, s15, empty@gotpcrel32@hi+12
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, also_empty@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, also_empty@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[22:23], s[14:15], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_and_b32 s14, 1, s17
+; GFX9-NEXT: s_cmp_eq_u32 s14, 1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_cselect_b32 s19, s23, s21
+; GFX9-NEXT: s_cselect_b32 s18, s22, s20
+; GFX9-NEXT: s_add_u32 s8, s8, 8
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s14, s16
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: indirect_call_known_callees:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_mov_b32 s32, 0
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-NEXT: s_add_u32 s0, s0, s17
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_mov_b32 s13, s15
+; GFX10-NEXT: s_mov_b32 s12, s14
+; GFX10-NEXT: s_getpc_b64 s[14:15]
+; GFX10-NEXT: s_add_u32 s14, s14, also_empty@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s15, s15, also_empty@gotpcrel32@hi+12
+; GFX10-NEXT: s_getpc_b64 s[18:19]
+; GFX10-NEXT: s_add_u32 s18, s18, empty@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s19, s19, empty@gotpcrel32@hi+12
+; GFX10-NEXT: s_load_dword s17, s[8:9], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[20:21], s[14:15], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[22:23], s[18:19], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_and_b32 s14, 1, s17
+; GFX10-NEXT: s_cmp_eq_u32 s14, 1
+; GFX10-NEXT: s_mov_b32 s14, s16
+; GFX10-NEXT: s_cselect_b32 s19, s23, s21
+; GFX10-NEXT: s_cselect_b32 s18, s22, s20
+; GFX10-NEXT: s_add_u32 s8, s8, 8
+; GFX10-NEXT: s_addc_u32 s9, s9, 0
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX10-NEXT: s_endpgm
+ %fptr = select i1 %cond, ptr @empty, ptr @also_empty
+ call void %fptr()
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+
+define void @use_intrinsic_workitem_id_x() {
+; GFX9-LABEL: use_intrinsic_workitem_id_x:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_and_b32_e32 v2, 0x3ff, v31
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: use_intrinsic_workitem_id_x:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_and_b32_e32 v2, 0x3ff, v31
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %val = call i32 @llvm.amdgcn.workitem.id.x()
+ store volatile i32 %val, ptr addrspace(1) null
+ ret void
+}
+
+define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
+; GFX9-LABEL: use_intrinsic_workitem_id_x_cc_kernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_store_dword v[1:2], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: use_intrinsic_workitem_id_x_cc_kernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: global_store_dword v[1:2], v0, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+ %val = call i32 @llvm.amdgcn.workitem.id.x()
+ store volatile i32 %val, ptr addrspace(1) null
+ ret void
+}
+
+define void @call_use_intrinsic_workitem_id_x() {
+; GFX9-LABEL: call_use_intrinsic_workitem_id_x:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s18, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[16:17]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[16:17]
+; GFX9-NEXT: s_add_u32 s16, s16, use_intrinsic_workitem_id_x@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s17, s17, use_intrinsic_workitem_id_x@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: v_writelane_b32 v3, s30, 0
+; GFX9-NEXT: v_writelane_b32 v3, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: v_readlane_b32 s31, v3, 1
+; GFX9-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b32 s33, s18
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: call_use_intrinsic_workitem_id_x:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s18, s33
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
+; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s16
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_getpc_b64 s[16:17]
+; GFX10-NEXT: s_add_u32 s16, s16, use_intrinsic_workitem_id_x@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s17, s17, use_intrinsic_workitem_id_x@gotpcrel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v3, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX10-NEXT: v_writelane_b32 v3, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s31, v3, 1
+; GFX10-NEXT: v_readlane_b32 s30, v3, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_mov_b32 s33, s18
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ call void @use_intrinsic_workitem_id_x()
+ ret void
+}
+
+define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
+; GFX9-LABEL: call_use_intrinsic_workitem_id_x_cc_kernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GFX9-NEXT: s_add_u32 s0, s0, s17
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_mov_b32 s13, s15
+; GFX9-NEXT: s_mov_b32 s12, s14
+; GFX9-NEXT: s_getpc_b64 s[14:15]
+; GFX9-NEXT: s_add_u32 s14, s14, use_intrinsic_workitem_id_x@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s15, s15, use_intrinsic_workitem_id_x@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s14, s16
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: call_use_intrinsic_workitem_id_x_cc_kernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_mov_b32 s32, 0
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-NEXT: s_add_u32 s0, s0, s17
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_mov_b32 s13, s15
+; GFX10-NEXT: s_mov_b32 s12, s14
+; GFX10-NEXT: s_getpc_b64 s[14:15]
+; GFX10-NEXT: s_add_u32 s14, s14, use_intrinsic_workitem_id_x@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s15, s15, use_intrinsic_workitem_id_x@gotpcrel32@hi+12
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX10-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT: s_mov_b32 s14, s16
+; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX10-NEXT: s_endpgm
+ call void @use_intrinsic_workitem_id_x()
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
index 748596d51c4ae..4f341fa71cf68 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
@@ -22,7 +22,7 @@
; NOOPT: .amdhsa_user_sgpr_queue_ptr 1
; NOOPT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; NOOPT: .amdhsa_user_sgpr_dispatch_id 1
-; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 1
+; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0
; NOOPT: .amdhsa_user_sgpr_private_segment_size 0
; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1
@@ -30,9 +30,11 @@
; NOOPT: .amdhsa_system_sgpr_workgroup_id_z 1
; NOOPT: .amdhsa_system_sgpr_workgroup_info 0
; NOOPT: .amdhsa_system_vgpr_workitem_id 2
-define amdgpu_kernel void @foo() {
+define amdgpu_kernel void @foo() #0 {
ret void
}
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
+
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION}
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index 18f1e8e1dbd4b..3fe3cafd729a7 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -12,13 +12,13 @@
; OSABI-AMDHSA-ASM: .section .rodata,"a"
; OSABI-AMDHSA-ASM: .p2align 6
; OSABI-AMDHSA-ASM: .amdhsa_kernel fadd
-; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 14
+; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 12
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 18
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 1
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
; OSABI-AMDHSA-ASM: .text
@@ -31,13 +31,13 @@
; OSABI-AMDHSA-ASM: .section .rodata,"a"
; OSABI-AMDHSA-ASM: .p2align 6
; OSABI-AMDHSA-ASM: .amdhsa_kernel fsub
-; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 14
+; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 12
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 18
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 1
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
; OSABI-AMDHSA-ASM: .text
@@ -68,7 +68,7 @@
define amdgpu_kernel void @fadd(
ptr addrspace(1) %r,
ptr addrspace(1) %a,
- ptr addrspace(1) %b) {
+ ptr addrspace(1) %b) #0 {
entry:
%a.val = load float, ptr addrspace(1) %a
%b.val = load float, ptr addrspace(1) %b
@@ -80,7 +80,7 @@ entry:
define amdgpu_kernel void @fsub(
ptr addrspace(1) %r,
ptr addrspace(1) %a,
- ptr addrspace(1) %b) {
+ ptr addrspace(1) %b) #0 {
entry:
%a.val = load float, ptr addrspace(1) %a
%b.val = load float, ptr addrspace(1) %b
@@ -99,7 +99,9 @@ define amdgpu_kernel void @empty(
i32 %i,
ptr addrspace(1) %r,
ptr addrspace(1) %a,
- ptr addrspace(1) %b) {
+ ptr addrspace(1) %b) #0 {
entry:
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index c167834470e3b..6565dd5270b15 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -5,9 +5,6 @@
define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
; CHECK-LABEL: _Z11test_kernelPii:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-NEXT: s_add_i32 s12, s12, s17
-; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s0, 3
@@ -53,3 +50,5 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
ret void
}
+
+attributes #5 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index fc17d9288bf40..a970a0c750c02 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -3,11 +3,9 @@
;
; This code is used to trigger the following dag node, with different return type and vector element type: i16 extract_vec_elt <N x i8> v, 0
-define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) {
+define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) #0 {
; CHECK-LABEL: eggs:
; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0
; CHECK-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8
; CHECK-NEXT: v_mov_b32_e32 v0, 0
@@ -101,3 +99,5 @@ bb41: ; preds = %bb10, %bb
store <1 x i8> %tmp42, ptr %arg9
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index e6f02295e67d5..73640f2065012 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -90,13 +90,10 @@ bb:
ret i32 %i9
}
-define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
+define amdgpu_kernel void @s_add_co_br_user(i32 %i) #0 {
; GFX7-LABEL: s_add_co_br_user:
; GFX7: ; %bb.0: ; %bb
; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s0, s2, s2
; GFX7-NEXT: s_cmp_lt_u32 s0, s2
@@ -216,3 +213,5 @@ bb1:
store volatile i32 10, ptr addrspace(1) null
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
index fac9f5bf826a6..761ba28ca2557 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -5,9 +5,6 @@
define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v1i8:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -21,9 +18,6 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -38,9 +32,6 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i
define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v2i8:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -63,9 +54,6 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -92,9 +80,6 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i
define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v3i8:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -117,9 +102,6 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -146,9 +128,6 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i
define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v4i8:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -171,9 +150,6 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -200,9 +176,6 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i
define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v8i8:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s0, s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s0, 16
@@ -219,13 +192,10 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
; VI-LABEL: extract_vector_elt_v8i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v3
@@ -243,9 +213,6 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v16i8:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -268,9 +235,6 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -297,9 +261,6 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x
define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v32i8:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s0, s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s0, 16
@@ -316,13 +277,10 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
; VI-LABEL: extract_vector_elt_v32i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v3
@@ -340,9 +298,6 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v64i8:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x10
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -365,9 +320,6 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x40
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -399,9 +351,6 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v2i8:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s2, s[8:9], 0xa
; SI-NEXT: s_load_dword s3, s[8:9], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -421,14 +370,11 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out
; VI-NEXT: s_load_dword s2, s[8:9], 0x4c
; VI-NEXT: s_load_dword s3, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s2, s2, 3
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_lshr_b32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -442,9 +388,6 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v3i8:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s2, s[8:9], 0x13
; SI-NEXT: s_load_dword s3, s[8:9], 0xa
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -463,13 +406,10 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out
; VI-NEXT: s_load_dword s2, s[8:9], 0x4c
; VI-NEXT: s_load_dword s3, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s2, s2, 3
; VI-NEXT: s_lshr_b32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -484,9 +424,6 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v4i8:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_load_dword s4, s[8:9], 0xc
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -505,9 +442,6 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -529,9 +463,6 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out
define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v8i8:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_load_dword s4, s[8:9], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -550,9 +481,6 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -574,9 +502,6 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_0123:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -601,9 +526,6 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -636,9 +558,6 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_0145:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -662,9 +581,6 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -696,9 +612,6 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_45:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 4
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -715,9 +628,6 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 4
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -739,9 +649,6 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
; SI-LABEL: reduce_load_vector_v16i8_extract_0145:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -765,9 +672,6 @@ define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -796,4 +700,4 @@ define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
ret void
}
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 2957d0201c223..055932a880fc4 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -9,14 +9,11 @@
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF))
; unless isFabsFree returns true
-define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
+define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; CI-LABEL: s_fabs_free_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -29,9 +26,6 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -81,14 +75,11 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
ret void
}
-define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
+define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) #0 {
; CI-LABEL: s_fabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -101,9 +92,6 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -152,14 +140,11 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
ret void
}
-define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
+define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 {
; CI-LABEL: s_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -172,9 +157,6 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -210,13 +192,10 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
ret void
}
-define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
+define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) #0 {
; CI-LABEL: s_fabs_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
@@ -230,9 +209,6 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; VI-LABEL: s_fabs_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
@@ -271,13 +247,10 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
ret void
}
-define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) {
+define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) #0 {
; CI-LABEL: fabs_fold_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -295,9 +268,6 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s3
@@ -352,14 +322,11 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
ret void
}
define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CI-LABEL: v_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -374,9 +341,6 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -424,9 +388,6 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -439,9 +400,6 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -485,9 +443,6 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
@@ -514,9 +469,6 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -573,12 +525,9 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: s_lshr_b32 s2, s4, 16
@@ -604,12 +553,9 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -666,9 +612,6 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -692,9 +635,6 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -782,9 +722,6 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -803,9 +740,6 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -865,6 +799,5 @@ declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
attributes #1 = { nounwind readnone }
-
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 60334e46a4454..c8a9f56ac6089 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -74,9 +74,6 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-LABEL: global_store_2xi16_align2:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
-; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17
-; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -93,9 +90,6 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
-; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17
-; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -222,10 +216,8 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
-; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17
-; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1
+; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -235,7 +227,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2
; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5
; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3
@@ -252,9 +243,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
-; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17
-; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -363,9 +351,6 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-LABEL: global_store_2xi16_align4:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
-; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17
-; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -376,9 +361,6 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
-; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17
-; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -425,6 +407,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
ret void
}
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 9919497acea73..c1752f2623a3f 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -24,9 +24,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
; GFX678-LABEL: v_test_canonicalize_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -79,9 +76,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s2, s[8:9], 0x2
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX6-NEXT: s_add_i32 s12, s12, s17
-; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX6-NEXT: v_mov_b32_e32 v0, s0
@@ -93,9 +87,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -141,9 +132,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fabs_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -196,9 +184,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -252,9 +237,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fneg_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -307,9 +289,6 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
; GFX678-LABEL: test_fold_canonicalize_undef_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -349,9 +328,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -391,9 +367,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_bfrev_b32_e32 v2, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -436,9 +409,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 1.0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -479,9 +449,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, -1.0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -522,9 +489,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
; GFX678-LABEL: test_fold_canonicalize_literal_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -565,9 +529,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -607,13 +568,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
-; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: v_mov_b32_e32 v1, s1
; GFX678-NEXT: flat_store_dword v[0:1], v2
; GFX678-NEXT: s_endpgm
@@ -654,13 +612,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
-; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: v_mov_b32_e32 v1, s1
; GFX678-NEXT: flat_store_dword v[0:1], v2
; GFX678-NEXT: s_endpgm
@@ -701,13 +656,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
-; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: v_mov_b32_e32 v1, s1
; GFX678-NEXT: flat_store_dword v[0:1], v2
; GFX678-NEXT: s_endpgm
@@ -748,9 +700,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -791,9 +740,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_bfrev_b32_e32 v2, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -836,9 +782,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -879,9 +822,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out
; GFX678-LABEL: test_fold_canonicalize_qnan_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -922,9 +862,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -965,9 +902,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1008,9 +942,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1051,9 +982,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1094,9 +1022,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1137,9 +1062,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1180,9 +1102,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1
; GFX678-LABEL: v_test_canonicalize_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1234,9 +1153,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
; GFX6-LABEL: s_test_canonicalize_var_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX6-NEXT: s_add_i32 s12, s12, s17
-; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3]
; GFX6-NEXT: v_mov_b32_e32 v0, s0
@@ -1247,9 +1163,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
; GFX8-LABEL: s_test_canonicalize_var_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -1292,9 +1205,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fabs_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1347,9 +1257,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1
; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1403,9 +1310,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fneg_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1458,13 +1362,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-NEXT: v_mov_b32_e32 v1, v0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, v0
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1506,13 +1407,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1552,13 +1450,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1596,13 +1491,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1640,13 +1532,10 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %
; GFX678-LABEL: test_fold_canonicalize_literal_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1684,13 +1573,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX678-NEXT: v_mov_b32_e32 v1, v0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, v0
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1732,13 +1618,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, -1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1779,13 +1662,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1825,13 +1705,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, -1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1872,13 +1749,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out
; GFX678-LABEL: test_fold_canonicalize_qnan_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1916,13 +1790,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1960,13 +1831,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -2004,13 +1872,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -2048,13 +1913,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -2092,13 +1954,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -2136,13 +1995,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX678-NEXT: s_add_i32 s12, s12, s17
-; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -2181,9 +2037,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX6-NEXT: s_add_i32 s12, s12, s17
-; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2201,9 +2054,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2267,9 +2117,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX6-NEXT: s_add_i32 s12, s12, s17
-; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2287,9 +2134,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2353,9 +2197,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0
-; GFX6-NEXT: s_add_i32 s12, s12, s17
-; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2374,9 +2215,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2441,9 +2279,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX6-NEXT: s_add_i32 s12, s12, s17
-; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2467,9 +2302,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2536,9 +2368,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX6-NEXT: s_add_i32 s12, s12, s17
-; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2556,9 +2385,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2622,9 +2448,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX6-NEXT: s_add_i32 s12, s12, s17
-; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2642,9 +2465,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2709,9 +2529,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0
-; GFX6-NEXT: s_add_i32 s12, s12, s17
-; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2730,9 +2547,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2798,9 +2612,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX6-NEXT: s_add_i32 s12, s12, s17
-; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2824,9 +2635,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2892,9 +2700,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX6-NEXT: s_add_i32 s12, s12, s17
-; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -2912,9 +2717,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -3267,10 +3069,10 @@ define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 {
}
attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
-attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
-attributes #3 = { nounwind "denormal-fp-math"="ieee,ieee" }
-attributes #4 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
-attributes #5 = { nounwind "denormal-fp-math-f32"="dynamic,dynamic" }
-attributes #6 = { nounwind "denormal-fp-math-f32"="dynamic,ieee" }
-attributes #7 = { nounwind "denormal-fp-math-f32"="ieee,dynamic" }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-no-flat-scratch-init" }
+attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-no-flat-scratch-init" }
+attributes #3 = { nounwind "denormal-fp-math"="ieee,ieee" "amdgpu-no-flat-scratch-init" }
+attributes #4 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-no-flat-scratch-init" }
+attributes #5 = { nounwind "denormal-fp-math-f32"="dynamic,dynamic" "amdgpu-no-flat-scratch-init" }
+attributes #6 = { nounwind "denormal-fp-math-f32"="dynamic,ieee" "amdgpu-no-flat-scratch-init" }
+attributes #7 = { nounwind "denormal-fp-math-f32"="ieee,dynamic" "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index 513befe6e19e5..fc316b736d5f1 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -11,19 +11,18 @@
; ALL-LABEL: {{^}}test:
-; HSA-DEFAULT: flat_scr
-; HSA-NODEFAULT-NOT: flat_scr
+; ALL-NOT: flat_scr
; HSA-DEFAULT: flat_store_dword
; HSA-NODEFAULT: buffer_store_dword
; HSA-NOADDR64: flat_store_dword
-; HSA: .amdhsa_user_sgpr_flat_scratch_init 1
+; HSA: .amdhsa_user_sgpr_flat_scratch_init 0
; NOHSA-DEFAULT: buffer_store_dword
; NOHSA-NODEFAULT: flat_store_dword
; NOHSA-NOADDR64: flat_store_dword
-define amdgpu_kernel void @test(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out) #0 {
entry:
store i32 0, ptr addrspace(1) %out
ret void
@@ -38,7 +37,7 @@ entry:
; NOHSA-DEFAULT: buffer_store_dword
; NOHSA-NODEFAULT: flat_store_dword
; NOHSA-NOADDR64: flat_store_dword
-define amdgpu_kernel void @test_addr64(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_addr64(ptr addrspace(1) %out) #0 {
entry:
%out.addr = alloca ptr addrspace(1), align 4, addrspace(5)
@@ -54,5 +53,7 @@ entry:
ret void
}
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
+
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index fb2448fb80744..10864507ee456 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -16,9 +16,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
; VI-LABEL: multiple_fadd_use_test_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f32_e64 v0, s3, -1.0
; VI-NEXT: v_add_f32_e64 v1, s2, -1.0
@@ -83,11 +80,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-NEXT: s_load_dword s3, s[8:9], 0x2c
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_add_u32 s2, s0, 4
; VI-NEXT: v_add_f32_e64 v2, s4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -145,9 +139,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
; VI-LABEL: multiple_use_fadd_fmad_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s4, s0, 4
@@ -203,9 +194,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s6, s4, 4
; VI-NEXT: v_mov_b32_e32 v0, s1
@@ -267,9 +255,6 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0
; VI-NEXT: v_mul_f32_e32 v2, s2, v0
@@ -318,13 +303,10 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-NEXT: v_mul_f32_e32 v2, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -368,9 +350,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
; VI-DENORM: ; %bb.0:
; VI-DENORM-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
-; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16
; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0
@@ -389,9 +368,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
; VI-FLUSH: ; %bb.0:
; VI-FLUSH-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
-; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16
; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0
@@ -506,9 +482,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
; VI-DENORM: ; %bb.0:
; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
-; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
@@ -530,9 +503,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
; VI-FLUSH: ; %bb.0:
; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
-; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
@@ -629,9 +599,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
; VI-DENORM: ; %bb.0:
; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
-; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
@@ -653,9 +620,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
; VI-FLUSH: ; %bb.0:
; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
-; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3
@@ -754,8 +718,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-DENORM-NEXT: s_load_dword s6, s[8:9], 0x8
-; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
-; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
@@ -763,7 +725,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s1
; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
-; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-DENORM-NEXT: s_add_u32 s4, s2, 2
; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
; VI-DENORM-NEXT: s_addc_u32 s5, s3, 0
@@ -780,8 +741,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-FLUSH-NEXT: s_load_dword s6, s[8:9], 0x8
-; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
-; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
@@ -789,7 +748,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s1
; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-FLUSH-NEXT: s_add_u32 s4, s2, 2
; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
; VI-FLUSH-NEXT: s_addc_u32 s5, s3, 0
@@ -889,9 +847,6 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0
; VI-NEXT: v_mul_f16_e32 v2, s2, v0
@@ -943,13 +898,10 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 0xc600
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f16_e32 v0, s2, v0
; VI-NEXT: v_mul_f16_e32 v2, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -991,5 +943,5 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
ret void
}
-attributes #0 = { nounwind "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "unsafe-fp-math"="true" "amdgpu-no-flat-scratch-init" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index eb9eb42df4c78..ec57d1ea3d8d2 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -5,13 +5,10 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
-define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) {
+define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) #0 {
; CI-LABEL: fneg_fabs_fadd_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -29,9 +26,6 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -87,13 +81,10 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
ret void
}
-define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, half %y) {
+define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, half %y) #0 {
; CI-LABEL: fneg_fabs_fmul_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s1, s0, 0x7fff
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -112,9 +103,6 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -173,14 +161,11 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
; DAGCombiner will transform:
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF))
; unless isFabsFree returns true
-define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
+define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; CI-LABEL: fneg_fabs_free_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_bitset1_b32 s2, 15
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -193,9 +178,6 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s2, 15
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -246,14 +228,11 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
ret void
}
-define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
+define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) #0 {
; CI-LABEL: fneg_fabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_bitset1_b32 s2, 15
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -266,9 +245,6 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s2, 15
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -318,13 +294,10 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
ret void
}
-define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CIVI-LABEL: v_fneg_fabs_f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -375,13 +348,10 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
ret void
}
-define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <2 x half> %in) {
+define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <2 x half> %in) #0 {
; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
@@ -404,9 +374,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0x4000
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v2, s3
@@ -415,7 +383,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -453,14 +420,11 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
; FIXME: single bit op
; Combine turns this into integer op when bitcast source (from load)
-define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x half> %in) {
+define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x half> %in) #0 {
; CI-LABEL: s_fneg_fabs_v2f16_bc_src:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_or_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -473,9 +437,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_or_b32 s2, s2, 0x80008000
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -512,13 +473,10 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
ret void
}
-define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
+define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) #0 {
; CIVI-LABEL: fneg_fabs_v4f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000
; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000
@@ -562,9 +520,6 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
; CI-LABEL: fold_user_fneg_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s0, 16
; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
@@ -586,9 +541,7 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0xc400
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v2, s3
@@ -596,7 +549,6 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
; VI-NEXT: v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -631,14 +583,11 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
ret void
}
-define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) {
+define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) #0 {
; CI-LABEL: s_fneg_multi_use_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
@@ -656,9 +605,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
@@ -708,14 +654,11 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
ret void
}
-define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) {
+define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) #0 {
; CI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010
@@ -740,9 +683,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
-; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v5, 0xc400
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s1, s4, 16
@@ -751,7 +692,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
; VI-NEXT: v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mul_f16_e64 v5, |s4|, -4.0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_or_b32_e32 v4, v5, v4
; VI-NEXT: v_mov_b32_e32 v5, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -799,5 +739,5 @@ declare half @llvm.fabs.f16(half) #1
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 058c273a65d99..0b97403968193 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -3,7 +3,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
-define i32 @fneg_xor_select_i32(i1 %cond, i32 %arg0, i32 %arg1) {
+define i32 @fneg_xor_select_i32(i1 %cond, i32 %arg0, i32 %arg1) #1 {
; GCN-LABEL: fneg_xor_select_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25,7 +25,7 @@ define i32 @fneg_xor_select_i32(i1 %cond, i32 %arg0, i32 %arg1) {
ret i32 %fneg
}
-define <2 x i32> @fneg_xor_select_v2i32(<2 x i1> %cond, <2 x i32> %arg0, <2 x i32> %arg1) {
+define <2 x i32> @fneg_xor_select_v2i32(<2 x i1> %cond, <2 x i32> %arg0, <2 x i32> %arg1) #1 {
; GCN-LABEL: fneg_xor_select_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -53,7 +53,7 @@ define <2 x i32> @fneg_xor_select_v2i32(<2 x i1> %cond, <2 x i32> %arg0, <2 x i3
ret <2 x i32> %fneg
}
-define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr addrspace(1) %ptr) {
+define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr addrspace(1) %ptr) #1 {
; GFX7-LABEL: fneg_xor_select_i32_multi_use:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -92,7 +92,7 @@ define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr ad
ret i32 %fneg
}
-define i64 @fneg_xor_select_i64(i1 %cond, i64 %arg0, i64 %arg1) {
+define i64 @fneg_xor_select_i64(i1 %cond, i64 %arg0, i64 %arg1) #1 {
; GCN-LABEL: fneg_xor_select_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -116,7 +116,7 @@ define i64 @fneg_xor_select_i64(i1 %cond, i64 %arg0, i64 %arg1) {
ret i64 %fneg
}
-define <2 x i64> @fneg_xor_select_v2i64(<2 x i1> %cond, <2 x i64> %arg0, <2 x i64> %arg1) {
+define <2 x i64> @fneg_xor_select_v2i64(<2 x i1> %cond, <2 x i64> %arg0, <2 x i64> %arg1) #1 {
; GCN-LABEL: fneg_xor_select_v2i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -148,7 +148,7 @@ define <2 x i64> @fneg_xor_select_v2i64(<2 x i1> %cond, <2 x i64> %arg0, <2 x i6
ret <2 x i64> %fneg
}
-define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) {
+define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) #1 {
; GCN-LABEL: fneg_xor_select_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -172,7 +172,7 @@ define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) {
ret i16 %fneg
}
-define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i16> %arg1) {
+define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i16> %arg1) #1 {
; GFX7-LABEL: fneg_xor_select_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -227,7 +227,7 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1
ret <2 x i16> %fneg
}
-define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr addrspace(1) %ptr) {
+define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr addrspace(1) %ptr) #1 {
; GFX7-LABEL: fneg_xor_select_i16_multi_use:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -266,7 +266,7 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
ret i16 %fneg
}
-define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr addrspace(1) %ptr) {
+define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr addrspace(1) %ptr) #1 {
; GFX7-LABEL: fneg_xor_select_i64_multi_user:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -310,7 +310,7 @@ define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr a
ret i64 %fneg
}
-define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg1) {
+define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg1) #1 {
; GCN-LABEL: select_fneg_xor_select_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -346,7 +346,7 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg
ret i32 %select1
}
-define float @select_fneg_select_f32(i1 %cond0, i1 %cond1, float %arg0, float %arg1) {
+define float @select_fneg_select_f32(i1 %cond0, i1 %cond1, float %arg0, float %arg1) #1 {
; GCN-LABEL: select_fneg_select_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -377,7 +377,7 @@ define float @select_fneg_select_f32(i1 %cond0, i1 %cond1, float %arg0, float %a
ret float %select1
}
-define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
+define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) #1 {
; GCN-LABEL: fneg_xor_select_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -401,7 +401,7 @@ define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
ret double %fneg
}
-define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %arg1, ptr addrspace(1) %ptr) {
+define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %arg1, ptr addrspace(1) %ptr) #1 {
; GFX7-LABEL: fneg_xor_select_f64_multi_user:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -446,7 +446,7 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
ret double %fneg
}
-define double @fneg_xor_select_i64_user_with_srcmods(i1 %cond, i64 %arg0, i64 %arg1) {
+define double @fneg_xor_select_i64_user_with_srcmods(i1 %cond, i64 %arg0, i64 %arg1) #1 {
; GCN-LABEL: fneg_xor_select_i64_user_with_srcmods:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -473,7 +473,7 @@ define double @fneg_xor_select_i64_user_with_srcmods(i1 %cond, i64 %arg0, i64 %a
ret double %add
}
-define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, double %arg1) {
+define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, double %arg1) #1 {
; GCN-LABEL: select_fneg_select_fneg_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -511,7 +511,7 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
ret double %select1
}
-define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg1) {
+define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg1) #1 {
; GCN-LABEL: select_fneg_xor_select_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -549,7 +549,7 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg
ret i64 %select1
}
-define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1) {
+define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1) #1 {
; GFX7-LABEL: select_fneg_select_f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -600,7 +600,7 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1
ret half %select1
}
-define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg1) {
+define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg1) #1 {
; GCN-LABEL: select_fneg_xor_select_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -636,7 +636,7 @@ define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg
ret i16 %select1
}
-define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 x half> %arg0, <2 x half> %arg1) {
+define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 x half> %arg0, <2 x half> %arg1) #1 {
; GFX7-LABEL: select_fneg_select_v2f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -733,7 +733,7 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2
ret <2 x half> %select1
}
-define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1, <2 x i16> %arg0, <2 x i16> %arg1) {
+define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1, <2 x i16> %arg0, <2 x i16> %arg1) #1 {
; GFX7-LABEL: select_fneg_xor_select_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -820,7 +820,7 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1,
; pattern that appeared in rocm-device-libs to manually operate on the
; sign bit of the high half of a double
-define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) {
+define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) #1 {
; GCN-LABEL: cospiD_pattern0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -861,7 +861,7 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) {
ret double %i11
}
-define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
+define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) #1 {
; GCN-LABEL: cospiD_pattern1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -899,7 +899,7 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
}
; artifical example, scaled to operation on 16-bit halves of a float.
-define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
+define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) #1 {
; GFX7-LABEL: cospiD_pattern0_half:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -958,7 +958,7 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
ret float %i11
}
-define float @cospiD_pattern1_half(i16 %arg, float %arg1, float %arg2) {
+define float @cospiD_pattern1_half(i16 %arg, float %arg1, float %arg2) #1 {
; GFX7-LABEL: cospiD_pattern1_half:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -999,7 +999,7 @@ define float @cospiD_pattern1_half(i16 %arg, float %arg1, float %arg2) {
ret float %i7
}
-define double @fneg_f64_bitcast_vector_i64_to_f64(i64 %arg) {
+define double @fneg_f64_bitcast_vector_i64_to_f64(i64 %arg) #1 {
; GCN-LABEL: fneg_f64_bitcast_vector_i64_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1016,7 +1016,7 @@ define double @fneg_f64_bitcast_vector_i64_to_f64(i64 %arg) {
ret double %fneg
}
-define double @fneg_f64_bitcast_vector_v2i32_to_f64(<2 x i32> %arg) {
+define double @fneg_f64_bitcast_vector_v2i32_to_f64(<2 x i32> %arg) #1 {
; GCN-LABEL: fneg_f64_bitcast_vector_v2i32_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1033,7 +1033,7 @@ define double @fneg_f64_bitcast_vector_v2i32_to_f64(<2 x i32> %arg) {
ret double %fneg
}
-define double @fneg_f64_bitcast_vector_v2f32_to_f64(<2 x float> %arg) {
+define double @fneg_f64_bitcast_vector_v2f32_to_f64(<2 x float> %arg) #1 {
; GCN-LABEL: fneg_f64_bitcast_vector_v2f32_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1050,7 +1050,7 @@ define double @fneg_f64_bitcast_vector_v2f32_to_f64(<2 x float> %arg) {
ret double %fneg
}
-define double @fneg_f64_bitcast_vector_v4i16_to_f64(<4 x i16> %arg) {
+define double @fneg_f64_bitcast_vector_v4i16_to_f64(<4 x i16> %arg) #1 {
; GFX7-LABEL: fneg_f64_bitcast_vector_v4i16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1079,7 +1079,7 @@ define double @fneg_f64_bitcast_vector_v4i16_to_f64(<4 x i16> %arg) {
ret double %fneg
}
-define double @fneg_f64_bitcast_vector_v4f16_to_f64(<4 x half> %arg) {
+define double @fneg_f64_bitcast_vector_v4f16_to_f64(<4 x half> %arg) #1 {
; GFX7-LABEL: fneg_f64_bitcast_vector_v4f16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1110,7 +1110,7 @@ define double @fneg_f64_bitcast_vector_v4f16_to_f64(<4 x half> %arg) {
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v2i32_to_f64(i32 %elt0, i32 %elt1) {
+define double @fneg_f64_bitcast_build_vector_v2i32_to_f64(i32 %elt0, i32 %elt1) #1 {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1129,7 +1129,7 @@ define double @fneg_f64_bitcast_build_vector_v2i32_to_f64(i32 %elt0, i32 %elt1)
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v2f32_to_f64(float %elt0, float %elt1) {
+define double @fneg_f64_bitcast_build_vector_v2f32_to_f64(float %elt0, float %elt1) #1 {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1148,7 +1148,7 @@ define double @fneg_f64_bitcast_build_vector_v2f32_to_f64(float %elt0, float %el
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v4i16_to_f64(i16 %elt0, i16 %elt1, i16 %elt2, i16 %elt3) {
+define double @fneg_f64_bitcast_build_vector_v4i16_to_f64(i16 %elt0, i16 %elt1, i16 %elt2, i16 %elt3) #1 {
; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4i16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1187,7 +1187,7 @@ define double @fneg_f64_bitcast_build_vector_v4i16_to_f64(i16 %elt0, i16 %elt1,
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1, half %elt2, half %elt3) {
+define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1, half %elt2, half %elt3) #1 {
; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4f16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1228,7 +1228,7 @@ define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat %elt1, bfloat %elt2, bfloat %elt3) {
+define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat %elt1, bfloat %elt2, bfloat %elt3) #1 {
; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1269,7 +1269,7 @@ define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user(i32 %elt0, i32 %elt1, double %fp.val) {
+define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user(i32 %elt0, i32 %elt1, double %fp.val) #1 {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1289,7 +1289,7 @@ define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user(i32 %elt
ret double %fmul
}
-define { double, double } @fneg_f64_bitcast_build_vector_v2i32_to_f64_multi_modifier_user(i32 %elt0, i32 %elt1, double %fp.val0, double %fp.val1) {
+define { double, double } @fneg_f64_bitcast_build_vector_v2i32_to_f64_multi_modifier_user(i32 %elt0, i32 %elt1, double %fp.val0, double %fp.val1) #1 {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64_multi_modifier_user:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1318,7 +1318,7 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2i32_to_f64_multi_modi
ret { double, double } %ret.1
}
-define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user_integer_neg_source(i32 %elt0, i32 %elt1, double %fp.val) {
+define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user_integer_neg_source(i32 %elt0, i32 %elt1, double %fp.val) #1 {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user_integer_neg_source:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1342,7 +1342,7 @@ define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user_integer_
ret double %fmul
}
-define double @fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64(float %elt0, float %elt1) {
+define double @fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64(float %elt0, float %elt1) #1 {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1362,7 +1362,7 @@ define double @fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64(float
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_user(float %elt0, float %elt1, ptr addrspace(1) %ptr) {
+define double @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_user(float %elt0, float %elt1, ptr addrspace(1) %ptr) #1 {
; GFX7-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_user:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1394,7 +1394,7 @@ define double @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_user(fl
ret double %fneg
}
-define { double, <2 x float> } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_foldable_user(float %elt0, float %elt1, <2 x float> %arg.v2f32) {
+define { double, <2 x float> } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_foldable_user(float %elt0, float %elt1, <2 x float> %arg.v2f32) #1 {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_foldable_user:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1421,7 +1421,7 @@ define { double, <2 x float> } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitca
ret { double, <2 x float> } %ret.1
}
-define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_user(float %elt0, float %elt1) {
+define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_user(float %elt0, float %elt1) #1 {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_user:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1446,7 +1446,7 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_us
ret { double, double } %ret.1
}
-define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_foldable_user(float %elt0, float %elt1, double %arg.f64) {
+define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_foldable_user(float %elt0, float %elt1, double %arg.f64) #1 {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_foldable_user:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1471,14 +1471,12 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_fo
}
; Check for correct bitcasting back when there are multiple uses
-define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i1 %z, ptr addrspace(1) %dst) {
+define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i1 %z, ptr addrspace(1) %dst) #1 {
; GFX7-LABEL: multiple_uses_fneg_select_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x4
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x6
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bitcmp1_b32 s6, 0
; GFX7-NEXT: s_cselect_b64 vcc, -1, 0
@@ -1490,7 +1488,6 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
; GFX7-NEXT: s_cselect_b32 s0, s0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s5
@@ -1546,7 +1543,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
ret void
}
-define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) {
+define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) #1 {
; GCN-LABEL: fnge_select_f32_multi_use_regression:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
@@ -1601,3 +1598,5 @@ bb5: ; preds = %bb, %.entry
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #0
attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
+attributes #1 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 98e0b27cd955d..92a349a4db19f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -11,9 +11,6 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x8000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -26,9 +23,6 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -84,9 +78,6 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -101,9 +92,6 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -164,9 +152,6 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x8000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -179,9 +164,6 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -235,9 +217,6 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
; CI-LABEL: v_fneg_fold_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -255,9 +234,6 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: v_fneg_fold_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -313,9 +289,6 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -328,9 +301,6 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -370,17 +340,14 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 {
; CIVI-LABEL: s_fneg_v2f16_nonload:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
; CIVI-NEXT: ;;#ASMSTART
; CIVI-NEXT: ; def s2
; CIVI-NEXT: ;;#ASMEND
; CIVI-NEXT: s_xor_b32 s2, s2, 0x80008000
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: v_mov_b32_e32 v2, s2
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: v_mov_b32_e32 v1, s1
-; CIVI-NEXT: v_mov_b32_e32 v2, s2
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
;
@@ -421,9 +388,6 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -438,9 +402,6 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -488,9 +449,6 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -503,9 +461,6 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -546,9 +501,6 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; CI-LABEL: v_fneg_fold_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -575,9 +527,6 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; GFX8-LABEL: v_fneg_fold_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -623,9 +572,6 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
; CI-LABEL: v_extract_fneg_fold_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -647,9 +593,6 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
; GFX8-LABEL: v_extract_fneg_fold_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -729,9 +672,6 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0
; CIVI-LABEL: v_extract_fneg_no_fold_v2f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
@@ -783,5 +723,5 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 10573aad38a51..5915f49658e55 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -10,9 +10,6 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -24,9 +21,6 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -52,9 +46,6 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -66,9 +57,6 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -93,9 +81,6 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
; CIVI-LABEL: load_v3f16_arg:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_add_u32 s4, s0, 4
; CIVI-NEXT: s_addc_u32 s5, s1, 0
@@ -129,9 +114,6 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
; CIVI-LABEL: load_v4f16_arg:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v2, s2
@@ -157,9 +139,6 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -174,9 +153,6 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -207,9 +183,6 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s3, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -223,9 +196,6 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -257,9 +227,6 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -271,9 +238,6 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -301,9 +265,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s3, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -317,9 +278,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -350,9 +308,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
; CI-LABEL: extload_v3f16_to_v3f32_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
@@ -366,9 +321,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
; VI-LABEL: extload_v3f16_to_v3f32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
@@ -399,9 +351,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; CI-LABEL: extload_v4f16_to_v4f32_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s3, 16
; CI-NEXT: s_lshr_b32 s5, s2, 16
@@ -417,9 +366,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; VI-LABEL: extload_v4f16_to_v4f32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: s_lshr_b32 s5, s2, 16
@@ -455,9 +401,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s6, s1, 16
; CI-NEXT: s_lshr_b32 s7, s0, 16
@@ -486,9 +429,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s1, 16
; VI-NEXT: s_lshr_b32 s7, s0, 16
@@ -545,9 +485,6 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
; CI-LABEL: extload_f16_to_f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -561,9 +498,6 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
; VI-LABEL: extload_f16_to_f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x8
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -595,9 +529,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
; CI-LABEL: extload_v2f16_to_v2f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s1
@@ -614,9 +545,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
; VI-LABEL: extload_v2f16_to_v2f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x8
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s1
@@ -654,9 +582,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
; CI-LABEL: extload_v3f16_to_v3f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
; CI-NEXT: s_lshr_b32 s4, s2, 16
@@ -678,9 +603,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
; VI-LABEL: extload_v3f16_to_v3f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
; VI-NEXT: s_lshr_b32 s4, s2, 16
@@ -726,9 +648,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
; CI-LABEL: extload_v4f16_to_v4f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
@@ -754,9 +673,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
; VI-LABEL: extload_v4f16_to_v4f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s3, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s3
@@ -810,9 +726,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s6, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s6
@@ -860,9 +773,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s0, 16
; VI-NEXT: s_lshr_b32 s8, s2, 16
@@ -948,9 +858,6 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr
; CIVI-LABEL: global_load_store_f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -979,9 +886,6 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: global_load_store_v2f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -1010,9 +914,6 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add
; CIVI-LABEL: global_load_store_v4f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
@@ -1041,9 +942,6 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: global_load_store_v8f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -1072,9 +970,6 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr
; CIVI-LABEL: global_extload_f16_to_f32:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -1106,9 +1001,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v2f16_to_v2f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1125,9 +1017,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v2f16_to_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1163,9 +1052,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v3f16_to_v3f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1183,9 +1069,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v3f16_to_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1223,9 +1106,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v4f16_to_v4f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1245,9 +1125,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v4f16_to_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1288,9 +1165,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v8f16_to_v8f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1321,9 +1195,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v8f16_to_v8f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1380,9 +1251,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; CI-LABEL: global_extload_v16f16_to_v16f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s4, s2, 16
; CI-NEXT: v_mov_b32_e32 v5, s3
@@ -1441,9 +1309,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; VI-LABEL: global_extload_v16f16_to_v16f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1541,9 +1406,6 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr
; CIVI-LABEL: global_extload_f16_to_f64:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -1578,9 +1440,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v2f16_to_v2f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1599,9 +1458,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v2f16_to_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1642,9 +1498,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v3f16_to_v3f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1670,9 +1523,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v3f16_to_v3f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1724,9 +1574,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v4f16_to_v4f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1755,9 +1602,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v4f16_to_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1815,9 +1659,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v8f16_to_v8f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1866,9 +1707,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v8f16_to_v8f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1953,9 +1791,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; CI-LABEL: global_extload_v16f16_to_v16f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2050,9 +1885,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; VI-LABEL: global_extload_v16f16_to_v16f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2207,9 +2039,6 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p
; CIVI-LABEL: global_truncstore_f32_to_f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -2241,9 +2070,6 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v2f32_to_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2261,9 +2087,6 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v2f32_to_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2300,9 +2123,6 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v3f32_to_v3f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2326,9 +2146,6 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v3f32_to_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2374,9 +2191,6 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v4f32_to_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2398,9 +2212,6 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v4f32_to_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2443,9 +2254,6 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v8f32_to_v8f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2481,9 +2289,6 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v8f32_to_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2547,9 +2352,6 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; CI-LABEL: global_truncstore_v16f32_to_v16f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s4, s2, 32
; CI-NEXT: s_addc_u32 s5, s3, 0
@@ -2618,9 +2420,6 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; VI-LABEL: global_truncstore_v16f32_to_v16f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 32
; VI-NEXT: s_addc_u32 s5, s3, 0
@@ -2731,9 +2530,6 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
; CI-LABEL: fadd_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -2751,9 +2547,6 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s3
@@ -2784,9 +2577,6 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
; CI-LABEL: fadd_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
@@ -2808,9 +2598,6 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
; VI-LABEL: fadd_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: s_lshr_b32 s5, s2, 16
@@ -2842,9 +2629,6 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-LABEL: fadd_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2882,9 +2666,6 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: fadd_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2925,9 +2706,6 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s10, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v4, s0
@@ -2986,9 +2764,6 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s10, s7, 16
; VI-NEXT: s_lshr_b32 s11, s3, 16
@@ -3049,9 +2824,6 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr
; CIVI-LABEL: test_bitcast_from_half:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
@@ -3081,9 +2853,6 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
; CIVI-LABEL: test_bitcast_to_half:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -3109,4 +2878,4 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
ret void
}
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 741ea419c2a45..cd89a36fe538b 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -15,8 +15,7 @@
; CHECK: .max_flat_workgroup_size: 1024
; CHECK: .name: test
; CHECK: .private_segment_fixed_size: 0
-; GFX700: .sgpr_count: 22
-; GFX803: .sgpr_count: 24
+; CHECK: .sgpr_count: 10
; CHECK: .symbol: test.kd
; CHECK: .vgpr_count: {{3|6}}
; WAVE64: .wavefront_size: 64
@@ -24,7 +23,7 @@
define amdgpu_kernel void @test(
ptr addrspace(1) %r,
ptr addrspace(1) %a,
- ptr addrspace(1) %b) "amdgpu-no-implicitarg-ptr" {
+ ptr addrspace(1) %b) "amdgpu-no-implicitarg-ptr" "amdgpu-no-flat-scratch-init" {
entry:
%a.val = load half, ptr addrspace(1) %a
%b.val = load half, ptr addrspace(1) %b
@@ -49,8 +48,8 @@ entry:
; CHECK: .name: num_spilled_sgprs
; GFX700: .sgpr_spill_count: 10
-; GFX803: .sgpr_spill_count: 0
-; GFX900: .sgpr_spill_count: 0
+; GFX803: .sgpr_spill_count: 10
+; GFX900: .sgpr_spill_count: 62
; GFX1010: .sgpr_spill_count: 60
; CHECK: .symbol: num_spilled_sgprs.kd
define amdgpu_kernel void @num_spilled_sgprs(
@@ -171,7 +170,7 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 {
; CHECK-NEXT: - 1
; CHECK-NEXT: - 1
-attributes #0 = { "amdgpu-num-sgpr"="20" }
+attributes #0 = { "amdgpu-num-sgpr"="20" "amdgpu-no-flat-scratch-init" }
attributes #1 = { "amdgpu-num-vgpr"="20" }
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 024593c49dba1..60469b25dc28c 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -43,7 +43,7 @@
; ELF: 00E0: 6E616D65 A673696D 706C65BB 2E707269
; ELF: 00F0: 76617465 5F736567 6D656E74 5F666978
; ELF: 0100: 65645F73 697A6500 AB2E7367 70725F63
-; ELF: 0110: 6F756E74 0EB12E73 6770725F 7370696C
+; ELF: 0110: 6F756E74 06B12E73 6770725F 7370696C
; ELF: 0120: 6C5F636F 756E7400 A72E7379 6D626F6C
; ELF: 0130: A973696D 706C652E 6B64AB2E 76677072
; ELF: 0140: 5F636F75 6E7403B1 2E766770 725F7370
@@ -59,7 +59,7 @@
; ELF: 01E0: 73696D70 6C655F6E 6F5F6B65 726E6172
; ELF: 01F0: 6773BB2E 70726976 6174655F 7365676D
; ELF: 0200: 656E745F 66697865 645F7369 7A6500AB
-; ELF: 0210: 2E736770 725F636F 756E740C B12E7367
+; ELF: 0210: 2E736770 725F636F 756E7400 B12E7367
; ELF: 0220: 70725F73 70696C6C 5F636F75 6E7400A7
; ELF: 0230: 2E73796D 626F6CB5 73696D70 6C655F6E
; ELF: 0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
@@ -120,7 +120,7 @@ entry:
ret void
}
-attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index c7489e90aec27..a97d9d314c651 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -7,14 +7,12 @@
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
-define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
+define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) #0 {
; GFX8V4-LABEL: addrspacecast:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40
-; GFX8V4-NEXT: s_add_i32 s12, s12, s17
-; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT: v_mov_b32_e32 v4, 1
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1
; GFX8V4-NEXT: s_cselect_b32 s3, s3, 0
@@ -24,7 +22,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V4-NEXT: v_mov_b32_e32 v1, s3
; GFX8V4-NEXT: s_cselect_b32 s0, s2, 0
; GFX8V4-NEXT: s_cselect_b32 s1, s1, 0
-; GFX8V4-NEXT: v_mov_b32_e32 v4, 1
; GFX8V4-NEXT: v_mov_b32_e32 v2, s1
; GFX8V4-NEXT: v_mov_b32_e32 v3, s0
; GFX8V4-NEXT: flat_store_dword v[0:1], v4
@@ -38,9 +35,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0xc8
-; GFX8V5-NEXT: s_add_i32 s12, s12, s17
-; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8V5-NEXT: v_mov_b32_e32 v4, 1
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1
; GFX8V5-NEXT: s_cselect_b32 s2, s2, 0
@@ -50,7 +45,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V5-NEXT: v_mov_b32_e32 v1, s2
; GFX8V5-NEXT: s_cselect_b32 s0, s3, 0
; GFX8V5-NEXT: s_cselect_b32 s1, s1, 0
-; GFX8V5-NEXT: v_mov_b32_e32 v4, 1
; GFX8V5-NEXT: v_mov_b32_e32 v2, s1
; GFX8V5-NEXT: v_mov_b32_e32 v3, s0
; GFX8V5-NEXT: flat_store_dword v[0:1], v4
@@ -63,10 +57,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V4-LABEL: addrspacecast:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX9V4-NEXT: v_mov_b32_e32 v4, 1
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1
; GFX9V4-NEXT: s_cselect_b32 s2, s3, 0
@@ -76,7 +69,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V4-NEXT: v_mov_b32_e32 v1, s2
; GFX9V4-NEXT: s_cselect_b32 s0, s5, 0
; GFX9V4-NEXT: s_cselect_b32 s1, s1, 0
-; GFX9V4-NEXT: v_mov_b32_e32 v4, 1
; GFX9V4-NEXT: v_mov_b32_e32 v2, s1
; GFX9V4-NEXT: v_mov_b32_e32 v3, s0
; GFX9V4-NEXT: flat_store_dword v[0:1], v4
@@ -89,10 +81,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V5-LABEL: addrspacecast:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX9V5-NEXT: v_mov_b32_e32 v4, 1
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1
; GFX9V5-NEXT: s_cselect_b32 s2, s3, 0
@@ -102,7 +93,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V5-NEXT: v_mov_b32_e32 v1, s2
; GFX9V5-NEXT: s_cselect_b32 s0, s5, 0
; GFX9V5-NEXT: s_cselect_b32 s1, s1, 0
-; GFX9V5-NEXT: v_mov_b32_e32 v4, 1
; GFX9V5-NEXT: v_mov_b32_e32 v2, s1
; GFX9V5-NEXT: v_mov_b32_e32 v3, s0
; GFX9V5-NEXT: flat_store_dword v[0:1], v4
@@ -119,14 +109,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
ret void
}
-define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
+define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
; GFX8V4-LABEL: llvm_amdgcn_is_shared:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40
; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4
-; GFX8V4-NEXT: s_add_i32 s12, s12, s17
-; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -139,9 +126,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc
; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4
-; GFX8V5-NEXT: s_add_i32 s12, s12, s17
-; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -179,14 +163,11 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
ret void
}
-define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
+define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
; GFX8V4-LABEL: llvm_amdgcn_is_private:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44
; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4
-; GFX8V4-NEXT: s_add_i32 s12, s12, s17
-; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -199,9 +180,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8
; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4
-; GFX8V5-NEXT: s_add_i32 s12, s12, s17
-; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -282,13 +260,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
unreachable
}
-define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
+define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V4: ; %bb.0:
-; GFX8V4-NEXT: s_add_i32 s12, s12, s17
; GFX8V4-NEXT: v_mov_b32_e32 v0, s6
-; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8V4-NEXT: v_mov_b32_e32 v1, s7
; GFX8V4-NEXT: s_add_u32 s0, s8, 8
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -313,10 +288,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V5: ; %bb.0:
-; GFX8V5-NEXT: s_add_i32 s12, s12, s17
; GFX8V5-NEXT: v_mov_b32_e32 v0, s6
-; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8V5-NEXT: v_mov_b32_e32 v1, s7
; GFX8V5-NEXT: s_add_u32 s0, s8, 8
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -394,3 +366,5 @@ declare void @llvm.debugtrap()
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index 696ea98254086..0284e6c07b8d6 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -5,18 +5,18 @@
; Make sure we only use one 128-bit register instead of 2 for i128 asm
; constraints
-define amdgpu_kernel void @s_input_output_i128() {
+define amdgpu_kernel void @s_input_output_i128() #0 {
; GFX908-LABEL: name: s_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %13
- ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %12
+ ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %12
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: s_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %11
- ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %10
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %10
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=s"()
@@ -24,18 +24,18 @@ define amdgpu_kernel void @s_input_output_i128() {
ret void
}
-define amdgpu_kernel void @v_input_output_i128() {
+define amdgpu_kernel void @v_input_output_i128() #0 {
; GFX908-LABEL: name: v_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %13
- ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %12
+ ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %12
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:VReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: v_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %11
- ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %10
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %10
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6619145 /* reguse:VReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=v"()
@@ -43,21 +43,23 @@ define amdgpu_kernel void @v_input_output_i128() {
ret void
}
-define amdgpu_kernel void @a_input_output_i128() {
+define amdgpu_kernel void @a_input_output_i128() #0 {
; GFX908-LABEL: name: a_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:AReg_128 */, def %13
- ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:AReg_128 */, def %12
+ ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %12
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:AReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: a_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:AReg_128_Align2 */, def %11
- ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:AReg_128_Align2 */, def %10
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %10
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6488073 /* reguse:AReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = call i128 asm sideeffect "; def $0", "=a"()
call void asm sideeffect "; use $0", "a"(i128 %val)
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index b51cb9df8d784..d932b70a67c9f 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -22,9 +22,6 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
; VI-LABEL: s_insertelement_v2bf16_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -85,9 +82,6 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
; VI-LABEL: s_insertelement_v2bf16_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -150,9 +144,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -225,9 +216,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -298,9 +286,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -373,9 +358,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -453,14 +435,11 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1)
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -552,17 +531,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, s4, v0, v4
@@ -635,17 +611,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, v0, s4, v4
@@ -716,17 +689,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
@@ -799,17 +769,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, v1, s4, v4
@@ -886,12 +853,9 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1)
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -958,7 +922,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
+define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
; SI-LABEL: v_insertelement_v8bf16_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -984,12 +948,9 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
@@ -1040,7 +1001,7 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
ret void
}
-define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
+define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) #0 {
; SI-LABEL: v_insertelement_v8bf16_dynamic:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -1104,12 +1065,9 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -1258,7 +1216,7 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
ret void
}
-define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
+define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
; SI-LABEL: v_insertelement_v16bf16_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -1287,14 +1245,11 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -1356,7 +1311,7 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
ret void
}
-define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
+define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) #0 {
; SI-LABEL: v_insertelement_v16bf16_dynamic:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0
@@ -1462,14 +1417,11 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -1731,5 +1683,5 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 2cecbe376520d..8d84c232ec7f8 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -21,9 +21,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2i16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -71,9 +68,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -90,9 +84,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -161,9 +152,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -184,9 +172,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -268,9 +253,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -286,9 +268,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -343,9 +322,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -365,9 +341,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -455,9 +428,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -480,9 +450,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -577,9 +544,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2i16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -626,9 +590,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -645,9 +606,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -711,9 +669,6 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2f16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -759,9 +714,6 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2f16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -808,9 +760,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -829,9 +778,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -888,12 +834,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -910,12 +853,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -986,9 +926,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1007,9 +944,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1065,9 +999,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1086,9 +1017,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1156,9 +1084,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1177,9 +1102,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1247,9 +1169,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1268,9 +1187,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1325,9 +1241,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1346,9 +1259,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1403,9 +1313,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1424,9 +1331,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1495,9 +1399,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1516,9 +1417,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1593,9 +1491,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s4, s[4:5], 0x0
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1615,9 +1510,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1680,12 +1572,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -1704,12 +1593,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -1772,14 +1658,11 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1802,14 +1685,11 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_load_dword v4, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1878,17 +1758,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, s4, v0, v4
@@ -1900,12 +1777,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -1977,17 +1851,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, v0, s4, v4
@@ -1999,12 +1870,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2076,17 +1944,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
@@ -2098,12 +1963,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2175,17 +2037,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, v1, s4, v4
@@ -2197,12 +2056,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2274,17 +2130,14 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
@@ -2296,12 +2149,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2379,9 +2229,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
@@ -2409,9 +2256,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: flat_load_dword v4, v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
@@ -2515,12 +2359,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -2544,12 +2385,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2597,7 +2435,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
ret void
}
-define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
+define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v8f16_3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -2616,12 +2454,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
@@ -2639,12 +2474,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
@@ -2696,7 +2528,7 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
ret void
}
-define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
+define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v8i16_6:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -2716,12 +2548,9 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -2739,12 +2568,9 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
@@ -2797,7 +2623,7 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
ret void
}
-define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
+define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) #0 {
; GFX9-LABEL: v_insertelement_v8f16_dynamic:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -2848,12 +2674,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -2905,12 +2728,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
@@ -3072,7 +2892,7 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
ret void
}
-define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
+define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v16f16_3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -3094,14 +2914,11 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -3124,12 +2941,9 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s3
; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v4
@@ -3197,7 +3011,7 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
ret void
}
-define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
+define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v16i16_6:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -3220,14 +3034,12 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v12, 0x3020504
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -3235,7 +3047,6 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
-; VI-NEXT: v_mov_b32_e32 v12, 0x3020504
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_perm_b32 v3, s4, v3, v12
@@ -3249,14 +3060,11 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v0
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -3325,7 +3133,7 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
ret void
}
-define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
+define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) #0 {
; GFX9-LABEL: v_insertelement_v16f16_dynamic:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -3411,14 +3219,11 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -3511,14 +3316,11 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3]
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
@@ -3810,5 +3612,5 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
index 5dff7372ab561..2f7bae7d94a23 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -4,12 +4,9 @@
; Check illegal casts are codegened as poison, and not an error.
-define amdgpu_kernel void @use_group_to_global_addrspacecast(ptr addrspace(3) %ptr) {
+define amdgpu_kernel void @use_group_to_global_addrspacecast(ptr addrspace(3) %ptr) #0 {
; CHECK-LABEL: use_group_to_global_addrspacecast:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-NEXT: s_add_i32 s12, s12, s17
-; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: flat_store_dword v[0:1], v0
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -68,3 +65,5 @@ define amdgpu_kernel void @use_42_to_local_addrspacecast(ptr addrspace(42) %ptr)
%load = load volatile i32, ptr addrspace(3) %cast
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
index 55a5d50f06bbd..e71345d0fda7e 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
@@ -1,13 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
-define amdgpu_kernel void @load_idx_idy(ptr addrspace(4) %disp, ptr %g) {
+define amdgpu_kernel void @load_idx_idy(ptr addrspace(4) %disp, ptr %g) #1 {
; CHECK-LABEL: load_idx_idy:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dword s6, s[4:5], 0x4
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
@@ -37,3 +35,4 @@ entry:
declare noundef nonnull align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index 4edd0357c6e7a..1c1e7067d6730 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -7,7 +7,7 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
+define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) #0 {
; SI-LABEL: is_private_vgpr:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -30,12 +30,9 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x32
; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -62,13 +59,10 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x32
; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -116,7 +110,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; FIXME: setcc (zero_extend (setcc)), 1) not folded out, resulting in
; select and vcc branch.
-define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
+define amdgpu_kernel void @is_private_sgpr(ptr %ptr) #0 {
; SI-LABEL: is_private_sgpr:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[8:9], 0x1
@@ -139,9 +133,6 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; CI-SDAG: ; %bb.0:
; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1
; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x32
-; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1
; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -175,9 +166,6 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x32
-; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0
; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
@@ -240,6 +228,8 @@ bb1:
ret void
}
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
+
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index 9d078f7906b4d..5f1a03e575fb5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -7,7 +7,7 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
+define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) #0 {
; CIT-LABEL: is_local_vgpr:
; CIT: ; %bb.0:
; CIT-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -63,12 +63,9 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x33
; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -95,13 +92,10 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x33
; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -149,7 +143,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; FIXME: setcc (zero_extend (setcc)), 1) not folded out, resulting in
; select and vcc branch.
-define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
+define amdgpu_kernel void @is_local_sgpr(ptr %ptr) #0 {
; CIT-LABEL: is_local_sgpr:
; CIT: ; %bb.0:
; CIT-NEXT: s_load_dword s0, s[6:7], 0x1
@@ -206,9 +200,6 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; CI-SDAG: ; %bb.0:
; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1
; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x33
-; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1
; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -242,9 +233,6 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x33
-; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0
; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
@@ -307,6 +295,8 @@ bb1:
ret void
}
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
+
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 0fe371c1b51fe..839b8df7d249e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -20,14 +20,11 @@ define void @function_lds_id(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
+define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) #0 !llvm.amdgcn.lds.kernel.id !0 {
; GCN-LABEL: kernel_lds_id:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s2, s14, 42
+; GCN-NEXT: s_add_i32 s2, s12, 42
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
@@ -41,45 +38,36 @@ define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds
ret void
}
-define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !1 {
+define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) #0 !llvm.amdgcn.lds.kernel.id !1 {
; GCN-LABEL: indirect_lds_id:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_add_u32 s0, s0, s17
+; GCN-NEXT: s_add_u32 s0, s0, s15
; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_mov_b32 s13, s15
-; GCN-NEXT: s_mov_b32 s12, s14
-; GCN-NEXT: s_load_dwordx2 s[18:19], s[8:9], 0x0
+; GCN-NEXT: s_load_dwordx2 s[16:17], s[8:9], 0x0
; GCN-NEXT: s_add_u32 s8, s8, 8
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: s_addc_u32 s9, s9, 0
-; GCN-NEXT: s_getpc_b64 s[14:15]
-; GCN-NEXT: s_add_u32 s14, s14, function_lds_id@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s15, s15, function_lds_id@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dwordx2 s[20:21], s[14:15], 0x0
+; GCN-NEXT: s_getpc_b64 s[18:19]
+; GCN-NEXT: s_add_u32 s18, s18, function_lds_id@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s19, s19, function_lds_id@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: v_or_b32_e32 v31, v0, v2
; GCN-NEXT: s_mov_b32 s15, 21
-; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s18
-; GCN-NEXT: v_mov_b32_e32 v1, s19
-; GCN-NEXT: s_swappc_b64 s[30:31], s[20:21]
+; GCN-NEXT: v_mov_b32_e32 v0, s16
+; GCN-NEXT: v_mov_b32_e32 v1, s17
+; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_endpgm
call void @function_lds_id(ptr addrspace(1) %out)
ret void
}
-define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
+define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) #0 !llvm.amdgcn.lds.kernel.id !0 {
; GCN-LABEL: doesnt_use_it:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v2, 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -91,6 +79,7 @@ define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds
ret void
}
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
!0 = !{i32 42}
!1 = !{i32 21}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index cc9e34be209b4..5ce8d07380e7f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -280,13 +280,10 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -297,9 +294,6 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -311,17 +305,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -330,13 +321,10 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -345,17 +333,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -364,14 +349,11 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -380,19 +362,16 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_m0:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 m0, -1
; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
@@ -400,15 +379,12 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
; CHECK-GISEL-LABEL: test_readfirstlane_m0:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 m0, -1
; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -418,35 +394,29 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 s2, 0
; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -456,20 +426,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1
ret void
}
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -477,16 +444,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -496,20 +460,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
ret void
}
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #0 {
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -517,16 +478,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -1643,3 +1601,4 @@ define void @test_readfirstlane_v32f16(ptr addrspace(1) %out, <32 x half> %src)
ret void
}
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index f2b0959cc706e..fb7b2775de608 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -179,9 +179,6 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -192,9 +189,6 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -210,13 +204,10 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -225,13 +216,10 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -244,13 +232,10 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -259,14 +244,11 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -280,9 +262,6 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -302,9 +281,6 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -335,9 +311,6 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -359,9 +332,6 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -395,9 +365,6 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -419,9 +386,6 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -455,15 +419,12 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
; CHECK-SDAG-LABEL: test_readlane_m0_sreg:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 m0, -1
; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
@@ -471,15 +432,12 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
; CHECK-GISEL-LABEL: test_readlane_m0_sreg:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 m0, -1
; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -496,14 +454,11 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
@@ -513,13 +468,10 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; def v0
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -533,17 +485,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -556,13 +505,10 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: ; def v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -577,17 +523,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -600,13 +543,10 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: ; def v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -621,31 +561,25 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou
; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 s2, 0
; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -659,16 +593,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -676,16 +607,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -699,16 +627,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -716,16 +641,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -975,5 +897,5 @@ define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
-attributes #1 = { nounwind }
+attributes #1 = { nounwind "amdgpu-no-flat-scratch-init" }
attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 4ac2cc98970b5..3e92d73918cdd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -15,9 +15,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
; GFX802-SDAG-LABEL: test_writelane_sreg_i32:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
@@ -56,9 +53,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
@@ -104,9 +98,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
@@ -156,9 +147,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
@@ -214,9 +202,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
@@ -266,9 +251,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
@@ -324,9 +306,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -369,9 +348,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -420,9 +396,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
@@ -471,9 +444,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -528,14 +498,11 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
@@ -584,14 +551,11 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -645,9 +609,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -707,9 +668,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -780,9 +738,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -848,9 +803,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -925,9 +877,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -936,7 +886,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1]
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
@@ -997,9 +946,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -1009,7 +956,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000
; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0
; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1082,18 +1028,15 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
; GFX802-SDAG-NEXT: ;;#ASMSTART
; GFX802-SDAG-NEXT: s_mov_b32 m0, -1
; GFX802-SDAG-NEXT: ;;#ASMEND
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: s_mov_b32 s4, m0
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
; GFX802-SDAG-NEXT: s_endpgm
;
@@ -1138,18 +1081,15 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
; GFX802-GISEL-NEXT: ;;#ASMSTART
; GFX802-GISEL-NEXT: s_mov_b32 m0, -1
; GFX802-GISEL-NEXT: ;;#ASMEND
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: s_mov_b32 s4, m0
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0
-; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX802-GISEL-NEXT: s_endpgm
;
@@ -1198,9 +1138,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -1243,9 +1180,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1293,9 +1227,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
; GFX802-SDAG-LABEL: test_writelane_imm_i64:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
@@ -1339,9 +1270,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
; GFX802-GISEL-LABEL: test_writelane_imm_i64:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -1391,9 +1319,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
; GFX802-SDAG-LABEL: test_writelane_imm_f64:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
@@ -1437,9 +1362,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
; GFX802-GISEL-LABEL: test_writelane_imm_f64:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -1490,9 +1412,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
@@ -1530,9 +1449,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
@@ -1576,13 +1492,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0
@@ -1625,14 +1538,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -1679,13 +1589,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0
@@ -1728,14 +1635,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -1780,10 +1684,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -1815,10 +1716,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1856,14 +1754,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
@@ -1902,14 +1797,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -1953,14 +1845,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10
-; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
-; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
@@ -1999,14 +1888,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -2804,5 +2690,5 @@ define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %sr
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
-attributes #1 = { nounwind }
+attributes #1 = { nounwind "amdgpu-no-flat-scratch-init"}
attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 919c1dfd4694e..6f638e33488bd 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -22,9 +22,6 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
; GFX7-HSA-LABEL: constant_load_f64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -63,10 +60,10 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
ret void
}
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
; Tests whether a load-chain of 8 constants of 64bit each gets vectorized into a wider load.
-define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) {
+define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) #0 {
; GFX6-NOHSA-LABEL: constant_load_2v4f64:
; GFX6-NOHSA: ; %bb.0: ; %entry
; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9
@@ -93,10 +90,7 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu
;
; GFX7-HSA-LABEL: constant_load_2v4f64:
; GFX7-HSA: ; %bb.0: ; %entry
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index a185157a553cf..469e2e0975b73 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
-define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GCN-NOHSA-SI-LABEL: constant_load_i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -27,9 +27,6 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: constant_load_i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -104,7 +101,7 @@ entry:
ret void
}
-define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GCN-NOHSA-SI-LABEL: constant_load_v2i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -120,9 +117,6 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v2i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -175,7 +169,7 @@ entry:
ret void
}
-define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GCN-NOHSA-SI-LABEL: constant_load_v3i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -194,9 +188,6 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v3i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: s_add_u32 s4, s0, 4
@@ -278,7 +269,7 @@ entry:
ret void
}
-define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GCN-NOHSA-SI-LABEL: constant_load_v4i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -295,9 +286,6 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v4i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -353,7 +341,7 @@ entry:
ret void
}
-define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GCN-NOHSA-SI-LABEL: constant_load_v8i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -372,9 +360,6 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v8i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -435,7 +420,7 @@ entry:
ret void
}
-define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GCN-NOHSA-SI-LABEL: constant_load_v16i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
@@ -460,9 +445,6 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
; GCN-HSA-LABEL: constant_load_v16i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GCN-HSA-NEXT: s_add_u32 s10, s8, 16
@@ -602,9 +584,6 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-HSA-LABEL: constant_load_v16i16_align2:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
@@ -858,9 +837,6 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_zextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -936,9 +912,6 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_sextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1015,9 +988,6 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1093,9 +1063,6 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1170,9 +1137,6 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1258,9 +1222,6 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1327,7 +1288,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
ret void
}
-define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GCN-NOHSA-SI-LABEL: constant_zextload_v3i16_to_v3i32:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1350,9 +1311,6 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1427,7 +1385,7 @@ entry:
ret void
}
-define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GCN-NOHSA-SI-LABEL: constant_sextload_v3i16_to_v3i32:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1450,9 +1408,6 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1555,9 +1510,6 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1664,9 +1616,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1784,9 +1733,6 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -1945,9 +1891,6 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2125,9 +2068,6 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2390,9 +2330,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2700,10 +2637,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3184,10 +3118,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3755,10 +3686,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4674,10 +4602,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -5464,9 +5389,6 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_zextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5564,9 +5486,6 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_sextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5665,9 +5584,6 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5760,9 +5676,6 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5860,15 +5773,12 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s0, s2, 16
; GCN-HSA-NEXT: s_and_b32 s1, s2, 0xffff
@@ -5973,9 +5883,6 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -6079,13 +5986,10 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16
; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16
@@ -6238,9 +6142,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6397,13 +6298,10 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16
; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16
@@ -6618,9 +6516,6 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6882,13 +6777,10 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16
; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16
@@ -7270,9 +7162,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -7748,10 +7637,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -8474,10 +8360,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -9196,4 +9079,4 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; ret void
; }
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 68a6a148819e8..ad291e742ebef 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -23,9 +23,6 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
; GFX7-HSA-LABEL: constant_load_i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -106,9 +103,6 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v2i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -196,9 +190,6 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v3i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -293,9 +284,6 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v4i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -395,9 +383,6 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v8i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16
@@ -532,9 +517,6 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v9i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s12, s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -696,9 +678,6 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-LABEL: constant_load_v10i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -868,9 +847,6 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-LABEL: constant_load_v11i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -1047,9 +1023,6 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-LABEL: constant_load_v12i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -1229,10 +1202,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
;
; GFX7-HSA-LABEL: constant_load_v16i32:
; GFX7-HSA: ; %bb.0: ; %entry
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_add_u32 s18, s16, 48
@@ -1419,9 +1389,6 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX7-HSA-LABEL: constant_zextload_i32_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1506,9 +1473,6 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX7-HSA-LABEL: constant_sextload_i32_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1599,9 +1563,6 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1686,9 +1647,6 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1781,15 +1739,12 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
@@ -1882,9 +1837,6 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1997,16 +1949,13 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
@@ -2133,9 +2082,6 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2298,10 +2244,8 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48
@@ -2309,7 +2253,6 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11
@@ -2509,9 +2452,6 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2808,10 +2748,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
;
; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64:
; GFX7-HSA: ; %bb.0:
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3259,10 +3196,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
;
; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64:
; GFX7-HSA: ; %bb.0:
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3694,10 +3628,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
;
; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64:
; GFX7-HSA: ; %bb.0:
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4548,10 +4479,8 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xf0
; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
@@ -4580,7 +4509,6 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x90
; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31
@@ -5169,10 +5097,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
;
; GFX7-HSA-LABEL: constant_load_v32i32:
; GFX7-HSA: ; %bb.0:
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -5477,4 +5402,4 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
ret void
}
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 2219ceea7ec9b..fa1b216e0c311 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -22,9 +22,6 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
; GFX7-LABEL: constant_load_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
@@ -98,9 +95,6 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-LABEL: constant_load_v2i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-NEXT: v_mov_b32_e32 v4, s0
@@ -185,9 +179,6 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-LABEL: constant_load_v3i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
@@ -303,9 +294,6 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-LABEL: constant_load_v4i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-NEXT: s_add_u32 s10, s8, 16
@@ -433,10 +421,7 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX7-LABEL: constant_load_v8i64:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-NEXT: s_add_u32 s18, s16, 48
@@ -653,10 +638,7 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
;
; GFX7-LABEL: constant_load_v16i64:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -945,4 +927,4 @@ entry:
ret void
}
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 4031be65fab61..d9d76ac16d608 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -27,9 +27,6 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
; GFX7-HSA-LABEL: constant_load_i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -115,9 +112,6 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v2i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -201,9 +195,6 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v3i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -314,9 +305,6 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v4i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -386,9 +374,6 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v8i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -463,9 +448,6 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v16i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -547,9 +529,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_zextload_i8_to_i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -625,9 +604,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_sextload_i8_to_i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -704,9 +680,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -782,9 +755,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -864,9 +834,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -966,9 +933,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1066,9 +1030,6 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1170,9 +1131,6 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1274,9 +1232,6 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1381,9 +1336,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1501,9 +1453,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -1663,9 +1612,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -1848,9 +1794,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2117,9 +2060,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2434,9 +2374,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2919,9 +2856,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3503,10 +3437,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
;
; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32:
; GFX7-HSA: ; %bb.0:
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4422,10 +4353,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
;
; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32:
; GFX7-HSA: ; %bb.0:
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -5233,9 +5161,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_zextload_i8_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5318,9 +5243,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_sextload_i8_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5406,9 +5328,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5489,9 +5408,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5580,9 +5496,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5690,9 +5603,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5806,13 +5716,10 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_bfe_u32 s4, s2, 0x80008
; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 24
@@ -5947,9 +5854,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6109,13 +6013,10 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 24
; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24
@@ -6334,9 +6235,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6606,13 +6504,10 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s8, s5, 24
; GFX7-HSA-NEXT: s_lshr_b32 s9, s4, 24
@@ -7003,9 +6898,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -7495,13 +7387,10 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s16, s8, 24
; GFX7-HSA-NEXT: s_lshr_b32 s17, s9, 24
@@ -8239,9 +8128,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -9012,9 +8898,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_zextload_i8_to_i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9099,9 +8982,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_sextload_i8_to_i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9188,9 +9068,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9275,9 +9152,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9367,9 +9241,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9469,9 +9340,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9584,9 +9452,6 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -9695,9 +9560,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -9821,9 +9683,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -9973,9 +9832,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -10158,9 +10014,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -10408,9 +10261,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -10724,9 +10574,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -11171,9 +11018,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -11601,4 +11445,4 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; ret void
; }
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 9054e509cde8e..ec2e6359ad46b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -7,7 +7,7 @@
; FIXME: r600 is broken because the bigger testcases spill and it's not implemented
-define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-NOHSA-SI-LABEL: global_load_i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -28,9 +28,6 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(
; GCN-HSA-LABEL: global_load_i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -115,7 +112,7 @@ entry:
ret void
}
-define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-NOHSA-SI-LABEL: global_load_v2i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -136,9 +133,6 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v2i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -203,7 +197,7 @@ entry:
ret void
}
-define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-NOHSA-SI-LABEL: global_load_v3i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -225,9 +219,6 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v3i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -327,7 +318,7 @@ entry:
ret void
}
-define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-NOHSA-SI-LABEL: global_load_v4i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -348,9 +339,6 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v4i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -415,7 +403,7 @@ entry:
ret void
}
-define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-NOHSA-SI-LABEL: global_load_v8i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -436,9 +424,6 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v8i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -503,7 +488,7 @@ entry:
ret void
}
-define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-NOHSA-SI-LABEL: global_load_v16i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -527,9 +512,6 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
; GCN-HSA-LABEL: global_load_v16i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
@@ -680,9 +662,6 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
; GCN-HSA-LABEL: global_load_v16i16_align2:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -832,9 +811,6 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_zextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -920,9 +896,6 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_sextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1011,9 +984,6 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1099,9 +1069,6 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1192,9 +1159,6 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1294,9 +1258,6 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1373,7 +1334,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
ret void
}
-define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-NOHSA-SI-LABEL: global_zextload_v3i16_to_v3i32:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1398,9 +1359,6 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1486,7 +1444,7 @@ entry:
ret void
}
-define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-NOHSA-SI-LABEL: global_sextload_v3i16_to_v3i32:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1511,9 +1469,6 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1631,9 +1586,6 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1749,9 +1701,6 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1874,9 +1823,6 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2026,9 +1972,6 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2193,9 +2136,6 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -2432,9 +2372,6 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2706,9 +2643,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -3120,9 +3054,6 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -3642,9 +3573,6 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -4449,9 +4377,6 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5217,9 +5142,6 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_zextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5317,9 +5239,6 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_sextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5415,9 +5334,6 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5510,9 +5426,6 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5611,9 +5524,6 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5723,9 +5633,6 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5844,9 +5751,6 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5992,9 +5896,6 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6155,10 +6056,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6173,11 +6074,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v14, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v16, v4
@@ -6377,9 +6275,6 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6630,10 +6525,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6650,10 +6545,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v18, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v20, v8
@@ -7013,9 +6905,6 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -7487,9 +7376,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -8192,9 +8078,6 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -8839,4 +8722,5 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; ret void
; }
-attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-no-flat-scratch-init" }
+attributes #1 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index e8c862a3cb93c..5652ab3ed4e70 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -27,9 +27,6 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(
; GCNX3-HSA-LABEL: global_load_i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -109,9 +106,6 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v2i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -192,9 +186,6 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v3i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -279,9 +270,6 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v4i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -364,9 +352,6 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v8i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
@@ -473,9 +458,6 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v9i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -607,9 +589,6 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v10i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -740,9 +719,6 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v11i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -878,9 +854,6 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v12i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -1014,9 +987,6 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v16i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -1164,9 +1134,6 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr
; GCNX3-HSA-LABEL: global_zextload_i32_to_i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1250,9 +1217,6 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr
; GCNX3-HSA-LABEL: global_sextload_i32_to_i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1337,9 +1301,6 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1423,9 +1384,6 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1513,9 +1471,6 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1614,9 +1569,6 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1722,10 +1674,8 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1733,7 +1683,6 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2
@@ -1851,9 +1800,6 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1995,10 +1941,8 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2013,7 +1957,6 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9
; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
@@ -2191,9 +2134,6 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2430,9 +2370,6 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2794,10 +2731,8 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -2831,7 +2766,6 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17
; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
@@ -3188,9 +3122,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -3658,12 +3589,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
;
; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64:
; GCN-GFX900-HSA: ; %bb.0:
-; GCN-GFX900-HSA-NEXT: s_mov_b64 s[22:23], s[2:3]
-; GCN-GFX900-HSA-NEXT: s_mov_b64 s[20:21], s[0:1]
+; GCN-GFX900-HSA-NEXT: s_mov_b64 s[18:19], s[2:3]
+; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1]
; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0
-; GCN-GFX900-HSA-NEXT: s_add_u32 s20, s20, s17
-; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0
+; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s15
+; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0
; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112
@@ -3689,11 +3620,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[20:23], 0 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[16:19], 0 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11
@@ -3736,11 +3667,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[20:23], 0 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[16:19], 0 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51
@@ -3982,9 +3913,6 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -4509,9 +4437,6 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v32i32:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -4724,4 +4649,4 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
ret void
}
-attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 8d020b9e1a603..d74064a6da9c5 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -9,8 +9,6 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
; CHECK-LABEL: memcpy_p0_p0_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v12, s3
; CHECK-NEXT: v_mov_b32_e32 v11, s2
@@ -96,12 +94,12 @@ entry:
define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 {
; CHECK-LABEL: memcpy_p5_p4_minsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
-; CHECK-NEXT: s_add_u32 s20, s20, s17
+; CHECK-NEXT: s_add_u32 s16, s16, s15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
@@ -109,50 +107,50 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
-; CHECK-NEXT: s_addc_u32 s21, s21, 0
+; CHECK-NEXT: s_addc_u32 s17, s17, 0
; CHECK-NEXT: v_mov_b32_e32 v25, s2
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124
-; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120
-; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116
-; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
+; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
+; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
+; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(9)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108
-; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104
-; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100
-; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96
+; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
+; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
+; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
+; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(13)
-; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92
-; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88
-; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84
-; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80
+; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
+; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
+; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
+; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76
-; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72
-; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68
-; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
+; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
+; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
+; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen offset:60
-; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56
-; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52
-; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48
+; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
+; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
+; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
+; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44
-; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40
-; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36
-; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32
+; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
+; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
+; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
+; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28
-; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -162,57 +160,55 @@ entry:
define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 {
; CHECK-LABEL: memcpy_p0_p5_minsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
-; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; CHECK-NEXT: s_add_u32 s20, s20, s17
-; CHECK-NEXT: s_addc_u32 s21, s21, 0
+; CHECK-NEXT: s_add_u32 s16, s16, s15
+; CHECK-NEXT: s_addc_u32 s17, s17, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v26, s0
-; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
+; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24
-; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
+; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
+; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
+; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
+; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
+; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
+; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
+; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
-; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
-; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64
@@ -272,8 +268,6 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
-; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v21, s1
; CHECK-NEXT: v_mov_b32_e32 v20, s0
@@ -300,8 +294,6 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-LABEL: memcpy_p0_p0_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v12, s3
; CHECK-NEXT: v_mov_b32_e32 v11, s2
@@ -387,12 +379,12 @@ entry:
define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 {
; CHECK-LABEL: memcpy_p5_p4_optsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
-; CHECK-NEXT: s_add_u32 s20, s20, s17
+; CHECK-NEXT: s_add_u32 s16, s16, s15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
@@ -400,50 +392,50 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
-; CHECK-NEXT: s_addc_u32 s21, s21, 0
+; CHECK-NEXT: s_addc_u32 s17, s17, 0
; CHECK-NEXT: v_mov_b32_e32 v25, s2
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124
-; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120
-; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116
-; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
+; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
+; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
+; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(9)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108
-; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104
-; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100
-; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96
+; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
+; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
+; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
+; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(13)
-; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92
-; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88
-; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84
-; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80
+; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
+; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
+; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
+; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76
-; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72
-; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68
-; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
+; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
+; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
+; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen offset:60
-; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56
-; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52
-; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48
+; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
+; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
+; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
+; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44
-; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40
-; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36
-; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32
+; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
+; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
+; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
+; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28
-; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -453,57 +445,55 @@ entry:
define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 {
; CHECK-LABEL: memcpy_p0_p5_optsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
-; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; CHECK-NEXT: s_add_u32 s20, s20, s17
-; CHECK-NEXT: s_addc_u32 s21, s21, 0
+; CHECK-NEXT: s_add_u32 s16, s16, s15
+; CHECK-NEXT: s_addc_u32 s17, s17, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v26, s0
-; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
+; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24
-; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
+; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
+; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
+; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
+; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
+; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
+; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
+; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
-; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
-; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64
@@ -563,8 +553,6 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
-; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v21, s1
; CHECK-NEXT: v_mov_b32_e32 v20, s0
@@ -601,6 +589,6 @@ declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly
declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
-attributes #0 = { minsize "amdgpu-flat-work-group-size"="1024,1024" }
-attributes #1 = { optsize "amdgpu-flat-work-group-size"="1024,1024" }
+attributes #0 = { minsize "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-no-flat-scratch-init" }
+attributes #1 = { optsize "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-no-flat-scratch-init" }
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 07ad8cb0c4a3d..b9916080dffcb 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -15,9 +15,6 @@
define amdgpu_kernel void @flat_agent_unordered_load(
; GFX7-LABEL: flat_agent_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -32,10 +29,6 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX10-WGP-LABEL: flat_agent_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -50,10 +43,6 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX10-CU-LABEL: flat_agent_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -82,8 +71,6 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -96,8 +83,6 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -187,7 +172,7 @@ define amdgpu_kernel void @flat_agent_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("agent") unordered, align 4
store i32 %val, ptr %out
@@ -197,9 +182,6 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX7-LABEL: flat_agent_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -214,10 +196,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -232,10 +210,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX10-CU-LABEL: flat_agent_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -264,8 +238,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -278,8 +250,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -369,7 +339,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("agent") monotonic, align 4
store i32 %val, ptr %out
@@ -379,9 +349,6 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_load(
; GFX7-LABEL: flat_agent_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -397,10 +364,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX10-WGP-LABEL: flat_agent_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -417,10 +380,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX10-CU-LABEL: flat_agent_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -451,8 +410,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -466,8 +423,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -566,7 +521,7 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("agent") acquire, align 4
store i32 %val, ptr %out
@@ -576,9 +531,6 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX7-LABEL: flat_agent_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -595,10 +547,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -617,10 +565,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -654,8 +598,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -670,8 +612,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -789,7 +729,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4
store i32 %val, ptr %out
@@ -799,9 +739,6 @@ entry:
define amdgpu_kernel void @flat_agent_unordered_store(
; GFX7-LABEL: flat_agent_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -813,10 +750,6 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX10-WGP-LABEL: flat_agent_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -828,10 +761,6 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX10-CU-LABEL: flat_agent_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -854,8 +783,6 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -866,8 +793,6 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -939,7 +864,7 @@ define amdgpu_kernel void @flat_agent_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4
ret void
@@ -948,9 +873,6 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX7-LABEL: flat_agent_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -962,10 +884,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -977,10 +895,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX10-CU-LABEL: flat_agent_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1003,8 +917,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1015,8 +927,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1088,7 +998,7 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4
ret void
@@ -1097,9 +1007,6 @@ entry:
define amdgpu_kernel void @flat_agent_release_store(
; GFX7-LABEL: flat_agent_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1112,10 +1019,6 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX10-WGP-LABEL: flat_agent_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1129,10 +1032,6 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX10-CU-LABEL: flat_agent_release_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1158,8 +1057,6 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1171,8 +1068,6 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1261,7 +1156,7 @@ define amdgpu_kernel void @flat_agent_release_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("agent") release, align 4
ret void
@@ -1270,9 +1165,6 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX7-LABEL: flat_agent_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1285,10 +1177,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1302,10 +1190,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1331,8 +1215,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1344,8 +1226,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1434,7 +1314,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4
ret void
@@ -1443,9 +1323,6 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX7-LABEL: flat_agent_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1457,10 +1334,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1472,10 +1345,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1498,8 +1367,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1510,8 +1377,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1583,7 +1448,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") monotonic
ret void
@@ -1592,9 +1457,6 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX7-LABEL: flat_agent_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1608,10 +1470,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1627,10 +1485,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1658,8 +1512,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1672,8 +1524,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1763,7 +1613,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
ret void
@@ -1772,9 +1622,6 @@ entry:
define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX7-LABEL: flat_agent_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1787,10 +1634,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1804,10 +1647,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1833,8 +1672,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1846,8 +1683,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1936,7 +1771,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") release
ret void
@@ -1945,9 +1780,6 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX7-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1962,10 +1794,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1983,10 +1811,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2017,8 +1841,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2032,8 +1854,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2140,7 +1960,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
ret void
@@ -2149,9 +1969,6 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX7-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2166,10 +1983,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2187,10 +2000,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2221,8 +2030,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2236,8 +2043,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2344,7 +2149,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
ret void
@@ -2353,9 +2158,6 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2372,10 +2174,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2393,10 +2191,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2429,8 +2223,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2445,8 +2237,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2552,7 +2342,7 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
store i32 %val, ptr %out, align 4
@@ -2562,9 +2352,6 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2582,10 +2369,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2605,10 +2388,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2644,8 +2423,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2661,8 +2438,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2789,7 +2564,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
store i32 %val, ptr %out, align 4
@@ -2799,9 +2574,6 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2819,10 +2591,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2842,10 +2610,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2881,8 +2645,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2898,8 +2660,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -3026,7 +2786,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out, align 4
@@ -3036,9 +2796,6 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3064,10 +2821,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3093,10 +2846,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3147,8 +2896,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3163,8 +2910,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3264,7 +3009,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic
@@ -3274,9 +3019,6 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3304,10 +3046,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3337,10 +3075,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3396,8 +3130,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3414,8 +3146,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3533,7 +3263,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
@@ -3543,9 +3273,6 @@ entry:
define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3572,10 +3299,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3603,10 +3326,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3660,8 +3379,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3677,8 +3394,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3795,7 +3510,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release monotonic
@@ -3805,9 +3520,6 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3836,10 +3548,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3871,10 +3579,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3933,8 +3637,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3952,8 +3654,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4088,7 +3788,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
@@ -4098,9 +3798,6 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4129,10 +3826,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4164,10 +3857,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4226,8 +3915,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4245,8 +3932,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4381,7 +4066,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
@@ -4391,9 +4076,6 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4421,10 +4103,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4454,10 +4132,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4513,8 +4187,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4531,8 +4203,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4650,7 +4320,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire
@@ -4660,9 +4330,6 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4690,10 +4357,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4723,10 +4386,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4782,8 +4441,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4800,8 +4457,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4919,7 +4574,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
@@ -4929,9 +4584,6 @@ entry:
define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4960,10 +4612,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4995,10 +4643,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5057,8 +4701,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5076,8 +4718,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5212,7 +4852,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release acquire
@@ -5222,9 +4862,6 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5253,10 +4890,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5288,10 +4921,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5350,8 +4979,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5369,8 +4996,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5505,7 +5130,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
@@ -5515,9 +5140,6 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5546,10 +5168,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5581,10 +5199,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5643,8 +5257,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5662,8 +5274,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5798,7 +5408,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
@@ -5808,9 +5418,6 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5839,10 +5446,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5874,10 +5477,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5936,8 +5535,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5955,8 +5552,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6091,7 +5686,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst
@@ -6101,9 +5696,6 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6132,10 +5724,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6167,10 +5755,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6229,8 +5813,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6248,8 +5830,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6384,7 +5964,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst
@@ -6394,9 +5974,6 @@ entry:
define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6425,10 +6002,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6460,10 +6033,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6522,8 +6091,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6541,8 +6108,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6677,7 +6242,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release seq_cst
@@ -6687,9 +6252,6 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6718,10 +6280,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6753,10 +6311,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6815,8 +6369,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6834,8 +6386,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6970,7 +6520,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst
@@ -6980,9 +6530,6 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -7011,10 +6558,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -7046,10 +6589,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -7108,8 +6647,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7127,8 +6664,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7263,7 +6798,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -7273,9 +6808,6 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7305,10 +6837,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7338,10 +6866,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7400,8 +6924,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7419,8 +6941,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7545,7 +7065,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic
@@ -7557,9 +7077,6 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7590,10 +7107,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7625,10 +7138,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7689,8 +7198,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7709,8 +7216,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7844,7 +7349,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
@@ -7856,9 +7361,6 @@ entry:
define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7889,10 +7391,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7924,10 +7422,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7989,8 +7483,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8009,8 +7501,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8152,7 +7642,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release monotonic
@@ -8164,9 +7654,6 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8198,10 +7685,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8235,10 +7718,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8302,8 +7781,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8323,8 +7800,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8479,7 +7954,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
@@ -8491,9 +7966,6 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8525,10 +7997,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8562,10 +8030,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8629,8 +8093,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8650,8 +8112,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8806,7 +8266,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
@@ -8818,9 +8278,6 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8851,10 +8308,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8886,10 +8339,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8950,8 +8399,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8970,8 +8417,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9109,7 +8554,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire
@@ -9121,9 +8566,6 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9154,10 +8596,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9189,10 +8627,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9253,8 +8687,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9273,8 +8705,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9408,7 +8838,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
@@ -9420,9 +8850,6 @@ entry:
define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9454,10 +8881,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9491,10 +8914,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9558,8 +8977,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9579,8 +8996,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9735,7 +9150,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release acquire
@@ -9747,9 +9162,6 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9781,10 +9193,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9818,10 +9226,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9885,8 +9289,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9906,8 +9308,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10062,7 +9462,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
@@ -10074,9 +9474,6 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10108,10 +9505,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10145,10 +9538,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10212,8 +9601,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10233,8 +9620,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10389,7 +9774,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
@@ -10401,9 +9786,6 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10435,10 +9817,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10472,10 +9850,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10539,8 +9913,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10560,8 +9932,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10716,7 +10086,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst
@@ -10728,9 +10098,6 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10762,10 +10129,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10799,10 +10162,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10866,8 +10225,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10887,8 +10244,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11039,7 +10394,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst
@@ -11051,9 +10406,6 @@ entry:
define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11085,10 +10437,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11122,10 +10470,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11189,8 +10533,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11210,8 +10552,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11366,7 +10706,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release seq_cst
@@ -11378,9 +10718,6 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11412,10 +10749,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11449,10 +10782,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11516,8 +10845,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11537,8 +10864,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11693,7 +11018,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst
@@ -11705,9 +11030,6 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11739,10 +11061,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11776,10 +11094,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11843,8 +11157,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11864,8 +11176,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12020,7 +11330,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -12032,9 +11342,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX7-LABEL: flat_agent_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12049,10 +11356,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12067,10 +11370,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12099,8 +11398,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12113,8 +11410,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12204,7 +11499,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") unordered, align 4
store i32 %val, ptr %out
@@ -12214,9 +11509,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX7-LABEL: flat_agent_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12231,10 +11523,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12249,10 +11537,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12281,8 +11565,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12295,8 +11577,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12386,7 +11666,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") monotonic, align 4
store i32 %val, ptr %out
@@ -12396,9 +11676,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX7-LABEL: flat_agent_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12415,10 +11692,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12436,10 +11709,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12472,8 +11741,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12488,8 +11755,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12593,7 +11858,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") acquire, align 4
store i32 %val, ptr %out
@@ -12603,9 +11868,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX7-LABEL: flat_agent_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12623,10 +11885,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12646,10 +11904,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12685,8 +11939,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12702,8 +11954,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12826,7 +12076,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") seq_cst, align 4
store i32 %val, ptr %out
@@ -12836,9 +12086,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX7-LABEL: flat_agent_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12850,10 +12097,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12865,10 +12108,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12891,8 +12130,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12903,8 +12140,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12976,7 +12211,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") unordered, align 4
ret void
@@ -12985,9 +12220,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX7-LABEL: flat_agent_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12999,10 +12231,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13014,10 +12242,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13040,8 +12264,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13052,8 +12274,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13125,7 +12345,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") monotonic, align 4
ret void
@@ -13134,9 +12354,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX7-LABEL: flat_agent_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13149,10 +12366,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13166,10 +12379,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13195,8 +12404,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13208,8 +12415,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13298,7 +12503,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") release, align 4
ret void
@@ -13307,9 +12512,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX7-LABEL: flat_agent_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13322,10 +12524,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13339,10 +12537,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13368,8 +12562,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13381,8 +12573,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13471,7 +12661,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") seq_cst, align 4
ret void
@@ -13480,9 +12670,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13494,10 +12681,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13509,10 +12692,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13535,8 +12714,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13547,8 +12724,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13620,7 +12795,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") monotonic
ret void
@@ -13629,9 +12804,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13645,10 +12817,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13663,10 +12831,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13693,8 +12857,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13707,8 +12869,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13796,7 +12956,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
ret void
@@ -13805,9 +12965,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13820,10 +12977,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13837,10 +12990,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13866,8 +13015,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13879,8 +13026,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13969,7 +13114,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") release
ret void
@@ -13978,9 +13123,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13995,10 +13137,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14015,10 +13153,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14048,8 +13182,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14063,8 +13195,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14169,7 +13299,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
ret void
@@ -14178,9 +13308,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14195,10 +13322,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14215,10 +13338,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14248,8 +13367,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14263,8 +13380,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14369,7 +13484,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
ret void
@@ -14378,9 +13493,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14398,10 +13510,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14420,10 +13528,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14458,8 +13562,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14475,8 +13577,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14587,7 +13687,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
store i32 %val, ptr %out, align 4
@@ -14597,9 +13697,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14618,10 +13715,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14642,10 +13735,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14683,8 +13772,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14701,8 +13788,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14834,7 +13919,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
store i32 %val, ptr %out, align 4
@@ -14844,9 +13929,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14865,10 +13947,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14889,10 +13967,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14930,8 +14004,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14948,8 +14020,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -15081,7 +14151,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
store i32 %val, ptr %out, align 4
@@ -15091,9 +14161,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15119,10 +14186,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15148,10 +14211,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15202,8 +14261,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15218,8 +14275,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15319,7 +14374,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
@@ -15329,9 +14384,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15359,10 +14411,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15391,10 +14439,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15449,8 +14493,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15467,8 +14509,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15584,7 +14624,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
@@ -15594,9 +14634,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15623,10 +14660,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15654,10 +14687,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15711,8 +14740,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15728,8 +14755,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15846,7 +14871,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic
@@ -15856,9 +14881,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15887,10 +14909,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15921,10 +14939,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15982,8 +14996,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16001,8 +15013,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16135,7 +15145,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
@@ -16145,9 +15155,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16176,10 +15183,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16210,10 +15213,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16271,8 +15270,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16290,8 +15287,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16424,7 +15419,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
@@ -16434,9 +15429,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16464,10 +15456,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16496,10 +15484,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16554,8 +15538,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16572,8 +15554,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16689,7 +15669,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire
@@ -16699,9 +15679,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16729,10 +15706,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16761,10 +15734,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16819,8 +15788,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16837,8 +15804,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16954,7 +15919,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
@@ -16964,9 +15929,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16995,10 +15957,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17029,10 +15987,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17090,8 +16044,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17109,8 +16061,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17243,7 +16193,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
@@ -17253,9 +16203,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17284,10 +16231,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17318,10 +16261,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17379,8 +16318,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17398,8 +16335,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17532,7 +16467,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
@@ -17542,9 +16477,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17573,10 +16505,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17607,10 +16535,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17668,8 +16592,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17687,8 +16609,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17821,7 +16741,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
@@ -17831,9 +16751,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17862,10 +16779,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17896,10 +16809,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17957,8 +16866,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17976,8 +16883,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18110,7 +17015,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst
@@ -18120,9 +17025,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18151,10 +17053,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18185,10 +17083,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18246,8 +17140,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18265,8 +17157,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18399,7 +17289,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst
@@ -18409,9 +17299,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18440,10 +17327,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18474,10 +17357,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18535,8 +17414,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18554,8 +17431,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18688,7 +17563,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst
@@ -18698,9 +17573,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18729,10 +17601,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18763,10 +17631,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18824,8 +17688,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18843,8 +17705,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18977,7 +17837,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst
@@ -18987,9 +17847,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -19018,10 +17875,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -19052,10 +17905,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -19113,8 +17962,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19132,8 +17979,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19266,7 +18111,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
@@ -19276,9 +18121,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19308,10 +18150,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19341,10 +18179,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19403,8 +18237,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19422,8 +18254,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19548,7 +18378,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
@@ -19560,9 +18390,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19594,10 +18421,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19630,10 +18453,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19696,8 +18515,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19717,8 +18534,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19857,7 +18672,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
@@ -19869,9 +18684,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19902,10 +18714,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19937,10 +18745,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20002,8 +18806,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20022,8 +18824,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20165,7 +18965,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic
@@ -20177,9 +18977,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20212,10 +19009,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20250,10 +19043,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20319,8 +19108,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20341,8 +19128,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20502,7 +19287,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
@@ -20514,9 +19299,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20549,10 +19331,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20587,10 +19365,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20656,8 +19430,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20678,8 +19450,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20839,7 +19609,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
@@ -20851,9 +19621,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20885,10 +19652,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20921,10 +19684,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20987,8 +19746,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21008,8 +19765,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21152,7 +19907,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire
@@ -21164,9 +19919,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21198,10 +19950,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21234,10 +19982,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21300,8 +20044,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21321,8 +20063,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21461,7 +20201,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
@@ -21473,9 +20213,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21508,10 +20245,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21546,10 +20279,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21615,8 +20344,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21637,8 +20364,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21798,7 +20523,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
@@ -21810,9 +20535,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21845,10 +20567,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21883,10 +20601,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21952,8 +20666,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21974,8 +20686,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22135,7 +20845,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
@@ -22147,9 +20857,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22182,10 +20889,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22220,10 +20923,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22289,8 +20988,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22311,8 +21008,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22472,7 +21167,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
@@ -22484,9 +21179,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22519,10 +21211,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22557,10 +21245,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22626,8 +21310,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22648,8 +21330,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22809,7 +21489,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst
@@ -22821,9 +21501,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22856,10 +21533,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22894,10 +21567,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22963,8 +21632,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22985,8 +21652,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -23142,7 +21807,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst
@@ -23154,9 +21819,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -23189,10 +21851,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -23227,10 +21885,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -23296,8 +21950,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -23318,8 +21970,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -23479,7 +22129,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst
@@ -23491,9 +22141,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -23526,10 +22173,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -23564,10 +22207,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -23633,8 +22272,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -23655,8 +22292,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -23816,7 +22451,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst
@@ -23828,9 +22463,6 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -23863,10 +22495,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -23901,10 +22529,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -23970,8 +22594,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -23992,8 +22614,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -24153,7 +22773,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
@@ -24161,3 +22781,5 @@ entry:
store i32 %val0, ptr %out, align 4
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 3c24c36ec547d..12ed89a163a6b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -15,9 +15,6 @@
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -32,10 +29,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -50,10 +43,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-CU-LABEL: flat_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -82,8 +71,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -96,8 +83,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -187,7 +172,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load i32, ptr %in, align 4, !nontemporal !0
store i32 %val, ptr %out
@@ -197,9 +182,6 @@ entry:
define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX7-LABEL: flat_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -229,10 +211,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
@@ -262,10 +240,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-CU-LABEL: flat_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
@@ -324,8 +298,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
@@ -357,8 +329,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
@@ -555,7 +525,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr %in, i32 %tid
@@ -567,9 +537,6 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX7-LABEL: flat_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -584,10 +551,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -602,10 +565,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-CU-LABEL: flat_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -634,8 +593,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -648,8 +605,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -739,7 +694,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load i32, ptr %in, align 4
store i32 %val, ptr %out, !nontemporal !0
@@ -749,9 +704,6 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX7-LABEL: flat_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -780,10 +732,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -811,10 +759,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-CU-LABEL: flat_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -870,8 +814,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -901,8 +843,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1095,7 +1035,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr %in, align 4
@@ -1107,9 +1047,6 @@ entry:
define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX7-LABEL: flat_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1125,10 +1062,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1144,10 +1077,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1178,8 +1107,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1193,8 +1120,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1293,12 +1218,13 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load volatile i32, ptr %in, align 4, !nontemporal !0
store i32 %val, ptr %out
ret void
}
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index b88a10ab24a98..3dd82b74a2b5d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -15,9 +15,6 @@
define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX7-LABEL: flat_singlethread_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -32,10 +29,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -50,10 +43,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX10-CU-LABEL: flat_singlethread_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -82,8 +71,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -96,8 +83,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -187,7 +172,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread") unordered, align 4
store i32 %val, ptr %out
@@ -197,9 +182,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX7-LABEL: flat_singlethread_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -214,10 +196,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -232,10 +210,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -264,8 +238,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -278,8 +250,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -369,7 +339,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread") monotonic, align 4
store i32 %val, ptr %out
@@ -379,9 +349,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX7-LABEL: flat_singlethread_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -396,10 +363,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -414,10 +377,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -446,8 +405,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -460,8 +417,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -551,7 +506,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread") acquire, align 4
store i32 %val, ptr %out
@@ -561,9 +516,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX7-LABEL: flat_singlethread_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -578,10 +530,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -596,10 +544,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -628,8 +572,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -642,8 +584,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -733,7 +673,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread") seq_cst, align 4
store i32 %val, ptr %out
@@ -743,9 +683,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX7-LABEL: flat_singlethread_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -757,10 +694,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -772,10 +705,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX10-CU-LABEL: flat_singlethread_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -798,8 +727,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -810,8 +737,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -883,7 +808,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4
ret void
@@ -892,9 +817,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX7-LABEL: flat_singlethread_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -906,10 +828,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -921,10 +839,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -947,8 +861,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -959,8 +871,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1032,7 +942,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4
ret void
@@ -1041,9 +951,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_store(
; GFX7-LABEL: flat_singlethread_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1055,10 +962,6 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX10-WGP-LABEL: flat_singlethread_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1070,10 +973,6 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX10-CU-LABEL: flat_singlethread_release_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1096,8 +995,6 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1108,8 +1005,6 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1181,7 +1076,7 @@ define amdgpu_kernel void @flat_singlethread_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4
ret void
@@ -1190,9 +1085,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX7-LABEL: flat_singlethread_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1204,10 +1096,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1219,10 +1107,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1245,8 +1129,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1257,8 +1139,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1330,7 +1210,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4
ret void
@@ -1339,9 +1219,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1353,10 +1230,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1368,10 +1241,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1394,8 +1263,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1406,8 +1273,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1479,7 +1344,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic
ret void
@@ -1488,9 +1353,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1502,10 +1364,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1517,10 +1375,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1543,8 +1397,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1555,8 +1407,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1628,7 +1478,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
ret void
@@ -1637,9 +1487,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX7-LABEL: flat_singlethread_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1651,10 +1498,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1666,10 +1509,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1692,8 +1531,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1704,8 +1541,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1777,7 +1612,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release
ret void
@@ -1786,9 +1621,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1800,10 +1632,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1815,10 +1643,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1841,8 +1665,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1853,8 +1675,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1926,7 +1746,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
ret void
@@ -1935,9 +1755,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1949,10 +1766,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1964,10 +1777,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1990,8 +1799,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2002,8 +1809,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2075,7 +1880,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
ret void
@@ -2084,9 +1889,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2102,10 +1904,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2121,10 +1919,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2155,8 +1949,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2170,8 +1962,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2268,7 +2058,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
store i32 %val, ptr %out, align 4
@@ -2278,9 +2068,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2296,10 +2083,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2315,10 +2098,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2349,8 +2128,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2364,8 +2141,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2462,7 +2237,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
store i32 %val, ptr %out, align 4
@@ -2472,9 +2247,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2490,10 +2262,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2509,10 +2277,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2543,8 +2307,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2558,8 +2320,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2656,7 +2416,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
store i32 %val, ptr %out, align 4
@@ -2666,9 +2426,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2694,10 +2451,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2723,10 +2476,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2777,8 +2526,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2793,8 +2540,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2894,7 +2639,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
@@ -2904,9 +2649,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2932,10 +2674,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2961,10 +2699,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3015,8 +2749,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3031,8 +2763,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3132,7 +2862,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
@@ -3142,9 +2872,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3170,10 +2897,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3199,10 +2922,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3253,8 +2972,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3269,8 +2986,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3370,7 +3085,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
@@ -3380,9 +3095,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3408,10 +3120,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3437,10 +3145,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3491,8 +3195,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3507,8 +3209,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3608,7 +3308,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
@@ -3618,9 +3318,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3646,10 +3343,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3675,10 +3368,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3729,8 +3418,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3745,8 +3432,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3846,7 +3531,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
@@ -3856,9 +3541,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3884,10 +3566,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3913,10 +3591,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3967,8 +3641,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3983,8 +3655,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4084,7 +3754,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
@@ -4094,9 +3764,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4122,10 +3789,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4151,10 +3814,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4205,8 +3864,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4221,8 +3878,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4322,7 +3977,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
@@ -4332,9 +3987,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4360,10 +4012,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4389,10 +4037,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4443,8 +4087,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4459,8 +4101,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4560,7 +4200,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
@@ -4570,9 +4210,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4598,10 +4235,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4627,10 +4260,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4681,8 +4310,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4697,8 +4324,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4798,7 +4423,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
@@ -4808,9 +4433,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4836,10 +4458,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4865,10 +4483,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4919,8 +4533,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4935,8 +4547,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5036,7 +4646,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
@@ -5046,9 +4656,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5074,10 +4681,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5103,10 +4706,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5157,8 +4756,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5173,8 +4770,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5274,7 +4869,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
@@ -5284,9 +4879,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5312,10 +4904,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5341,10 +4929,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5395,8 +4979,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5411,8 +4993,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5512,7 +5092,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
@@ -5522,9 +5102,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5550,10 +5127,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5579,10 +5152,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5633,8 +5202,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5649,8 +5216,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5750,7 +5315,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
@@ -5760,9 +5325,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5788,10 +5350,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5817,10 +5375,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5871,8 +5425,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5887,8 +5439,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5988,7 +5538,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
@@ -5998,9 +5548,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6026,10 +5573,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6055,10 +5598,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6109,8 +5648,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6125,8 +5662,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6226,7 +5761,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
@@ -6236,9 +5771,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6268,10 +5800,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6301,10 +5829,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6363,8 +5887,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6382,8 +5904,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6508,7 +6028,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
@@ -6520,9 +6040,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6552,10 +6069,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6585,10 +6098,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6647,8 +6156,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6666,8 +6173,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6792,7 +6297,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
@@ -6804,9 +6309,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6836,10 +6338,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6869,10 +6367,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6931,8 +6425,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6950,8 +6442,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7076,7 +6566,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
@@ -7088,9 +6578,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7120,10 +6607,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7153,10 +6636,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7215,8 +6694,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7234,8 +6711,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7360,7 +6835,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
@@ -7372,9 +6847,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7404,10 +6876,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7437,10 +6905,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7499,8 +6963,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7518,8 +6980,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7644,7 +7104,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
@@ -7656,9 +7116,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7688,10 +7145,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7721,10 +7174,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7783,8 +7232,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7802,8 +7249,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7928,7 +7373,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
@@ -7940,9 +7385,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7972,10 +7414,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8005,10 +7443,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8067,8 +7501,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8086,8 +7518,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8212,7 +7642,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
@@ -8224,9 +7654,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8256,10 +7683,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8289,10 +7712,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8351,8 +7770,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8370,8 +7787,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8496,7 +7911,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
@@ -8508,9 +7923,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8540,10 +7952,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8573,10 +7981,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8635,8 +8039,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8654,8 +8056,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8780,7 +8180,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
@@ -8792,9 +8192,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8824,10 +8221,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8857,10 +8250,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8919,8 +8308,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8938,8 +8325,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9064,7 +8449,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
@@ -9076,9 +8461,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9108,10 +8490,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9141,10 +8519,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9203,8 +8577,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9222,8 +8594,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9348,7 +8718,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
@@ -9360,9 +8730,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9392,10 +8759,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9425,10 +8788,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9487,8 +8846,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9506,8 +8863,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9632,7 +8987,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
@@ -9644,9 +8999,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9676,10 +9028,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9709,10 +9057,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9771,8 +9115,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9790,8 +9132,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9916,7 +9256,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
@@ -9928,9 +9268,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9960,10 +9297,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9993,10 +9326,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10055,8 +9384,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10074,8 +9401,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10200,7 +9525,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
@@ -10212,9 +9537,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10244,10 +9566,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10277,10 +9595,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10339,8 +9653,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10358,8 +9670,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10484,7 +9794,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
@@ -10496,9 +9806,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10513,10 +9820,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10531,10 +9834,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10563,8 +9862,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10577,8 +9874,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10668,7 +9963,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") unordered, align 4
store i32 %val, ptr %out
@@ -10678,9 +9973,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10695,10 +9987,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10713,10 +10001,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10745,8 +10029,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10759,8 +10041,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10850,7 +10130,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") monotonic, align 4
store i32 %val, ptr %out
@@ -10860,9 +10140,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10877,10 +10154,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10895,10 +10168,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10927,8 +10196,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10941,8 +10208,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11032,7 +10297,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") acquire, align 4
store i32 %val, ptr %out
@@ -11042,9 +10307,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11059,10 +10321,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11077,10 +10335,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11109,8 +10363,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11123,8 +10375,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11214,7 +10464,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") seq_cst, align 4
store i32 %val, ptr %out
@@ -11224,9 +10474,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11238,10 +10485,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11253,10 +10496,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11279,8 +10518,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11291,8 +10528,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11364,7 +10599,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4
ret void
@@ -11373,9 +10608,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11387,10 +10619,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11402,10 +10630,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11428,8 +10652,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11440,8 +10662,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11513,7 +10733,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4
ret void
@@ -11522,9 +10742,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX7-LABEL: flat_singlethread_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11536,10 +10753,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11551,10 +10764,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11577,8 +10786,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11589,8 +10796,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11662,7 +10867,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4
ret void
@@ -11671,9 +10876,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11685,10 +10887,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11700,10 +10898,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11726,8 +10920,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11738,8 +10930,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11811,7 +11001,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4
ret void
@@ -11820,9 +11010,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11834,10 +11021,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11849,10 +11032,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11875,8 +11054,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11887,8 +11064,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11960,7 +11135,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic
ret void
@@ -11969,9 +11144,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11983,10 +11155,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11998,10 +11166,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12024,8 +11188,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12036,8 +11198,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12109,7 +11269,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
ret void
@@ -12118,9 +11278,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12132,10 +11289,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12147,10 +11300,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12173,8 +11322,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12185,8 +11332,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12258,7 +11403,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release
ret void
@@ -12267,9 +11412,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12281,10 +11423,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12296,10 +11434,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12322,8 +11456,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12334,8 +11466,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12407,7 +11537,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
ret void
@@ -12416,9 +11546,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12430,10 +11557,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12445,10 +11568,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12471,8 +11590,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12483,8 +11600,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12556,7 +11671,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
ret void
@@ -12565,9 +11680,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12583,10 +11695,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12602,10 +11710,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12636,8 +11740,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12651,8 +11753,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12749,7 +11849,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
store i32 %val, ptr %out, align 4
@@ -12759,9 +11859,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12777,10 +11874,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12796,10 +11889,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12830,8 +11919,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12845,8 +11932,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12943,7 +12028,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
store i32 %val, ptr %out, align 4
@@ -12953,9 +12038,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12971,10 +12053,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12990,10 +12068,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13024,8 +12098,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13039,8 +12111,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13137,7 +12207,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
store i32 %val, ptr %out, align 4
@@ -13147,9 +12217,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13175,10 +12242,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13204,10 +12267,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13258,8 +12317,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13274,8 +12331,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13375,7 +12430,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
@@ -13385,9 +12440,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13413,10 +12465,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13442,10 +12490,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13496,8 +12540,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13512,8 +12554,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13613,7 +12653,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
@@ -13623,9 +12663,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13651,10 +12688,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13680,10 +12713,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13734,8 +12763,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13750,8 +12777,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13851,7 +12876,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
@@ -13861,9 +12886,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13889,10 +12911,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13918,10 +12936,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13972,8 +12986,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13988,8 +13000,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14089,7 +13099,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
@@ -14099,9 +13109,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14127,10 +13134,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14156,10 +13159,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14210,8 +13209,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14226,8 +13223,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14327,7 +13322,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
@@ -14337,9 +13332,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14365,10 +13357,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14394,10 +13382,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14448,8 +13432,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14464,8 +13446,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14565,7 +13545,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
@@ -14575,9 +13555,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14603,10 +13580,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14632,10 +13605,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14686,8 +13655,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14702,8 +13669,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14803,7 +13768,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
@@ -14813,9 +13778,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14841,10 +13803,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14870,10 +13828,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14924,8 +13878,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14940,8 +13892,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15041,7 +13991,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
@@ -15051,9 +14001,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15079,10 +14026,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15108,10 +14051,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15162,8 +14101,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15178,8 +14115,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15279,7 +14214,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
@@ -15289,9 +14224,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15317,10 +14249,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15346,10 +14274,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15400,8 +14324,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15416,8 +14338,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15517,7 +14437,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
@@ -15527,9 +14447,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15555,10 +14472,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15584,10 +14497,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15638,8 +14547,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15654,8 +14561,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15755,7 +14660,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
@@ -15765,9 +14670,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15793,10 +14695,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15822,10 +14720,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15876,8 +14770,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15892,8 +14784,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15993,7 +14883,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
@@ -16003,9 +14893,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16031,10 +14918,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16060,10 +14943,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16114,8 +14993,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16130,8 +15007,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16231,7 +15106,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
@@ -16241,9 +15116,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16269,10 +15141,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16298,10 +15166,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16352,8 +15216,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16368,8 +15230,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16469,7 +15329,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
@@ -16479,9 +15339,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16507,10 +15364,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16536,10 +15389,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16590,8 +15439,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16606,8 +15453,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16707,7 +15552,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
@@ -16717,9 +15562,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16749,10 +15591,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16782,10 +15620,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16844,8 +15678,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16863,8 +15695,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16989,7 +15819,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
@@ -17001,9 +15831,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17033,10 +15860,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17066,10 +15889,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17128,8 +15947,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17147,8 +15964,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17273,7 +16088,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
@@ -17285,9 +16100,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17317,10 +16129,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17350,10 +16158,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17412,8 +16216,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17431,8 +16233,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17557,7 +16357,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
@@ -17569,9 +16369,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17601,10 +16398,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17634,10 +16427,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17696,8 +16485,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17715,8 +16502,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17841,7 +16626,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
@@ -17853,9 +16638,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17885,10 +16667,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17918,10 +16696,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17980,8 +16754,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17999,8 +16771,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18125,7 +16895,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
@@ -18137,9 +16907,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18169,10 +16936,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18202,10 +16965,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18264,8 +17023,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18283,8 +17040,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18409,7 +17164,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
@@ -18421,9 +17176,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18453,10 +17205,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18486,10 +17234,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18548,8 +17292,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18567,8 +17309,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18693,7 +17433,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
@@ -18705,9 +17445,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18737,10 +17474,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18770,10 +17503,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18832,8 +17561,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18851,8 +17578,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18977,7 +17702,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
@@ -18989,9 +17714,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19021,10 +17743,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19054,10 +17772,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19116,8 +17830,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19135,8 +17847,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19261,7 +17971,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
@@ -19273,9 +17983,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19305,10 +18012,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19338,10 +18041,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19400,8 +18099,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19419,8 +18116,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19545,7 +18240,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
@@ -19557,9 +18252,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19589,10 +18281,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19622,10 +18310,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19684,8 +18368,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19703,8 +18385,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19829,7 +18509,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
@@ -19841,9 +18521,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19873,10 +18550,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19906,10 +18579,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19968,8 +18637,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19987,8 +18654,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20113,7 +18778,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
@@ -20125,9 +18790,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20157,10 +18819,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20190,10 +18848,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20252,8 +18906,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20271,8 +18923,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20397,7 +19047,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
@@ -20409,9 +19059,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20441,10 +19088,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20474,10 +19117,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20536,8 +19175,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20555,8 +19192,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20681,7 +19316,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
@@ -20693,9 +19328,6 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20725,10 +19357,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20758,10 +19386,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20820,8 +19444,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20839,8 +19461,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20965,7 +19585,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
@@ -20973,3 +19593,6 @@ entry:
store i32 %val0, ptr %out, align 4
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
+
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index 919fc3e8f4e4f..dc30a1e2de77a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -15,9 +15,6 @@
define amdgpu_kernel void @flat_system_unordered_load(
; GFX7-LABEL: flat_system_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -32,10 +29,6 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX10-WGP-LABEL: flat_system_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -50,10 +43,6 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX10-CU-LABEL: flat_system_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -82,8 +71,6 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -96,8 +83,6 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -187,7 +172,7 @@ define amdgpu_kernel void @flat_system_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in unordered, align 4
store i32 %val, ptr %out
@@ -197,9 +182,6 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_load(
; GFX7-LABEL: flat_system_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -214,10 +196,6 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX10-WGP-LABEL: flat_system_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -232,10 +210,6 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX10-CU-LABEL: flat_system_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -264,8 +238,6 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -278,8 +250,6 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -369,7 +339,7 @@ define amdgpu_kernel void @flat_system_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in monotonic, align 4
store i32 %val, ptr %out
@@ -379,9 +349,6 @@ entry:
define amdgpu_kernel void @flat_system_acquire_load(
; GFX7-LABEL: flat_system_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -397,10 +364,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX10-WGP-LABEL: flat_system_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -417,10 +380,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX10-CU-LABEL: flat_system_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -451,8 +410,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -467,8 +424,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -568,7 +523,7 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in acquire, align 4
store i32 %val, ptr %out
@@ -578,9 +533,6 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX7-LABEL: flat_system_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -597,10 +549,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -619,10 +567,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX10-CU-LABEL: flat_system_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -656,8 +600,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -673,8 +615,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -793,7 +733,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in seq_cst, align 4
store i32 %val, ptr %out
@@ -803,9 +743,6 @@ entry:
define amdgpu_kernel void @flat_system_unordered_store(
; GFX7-LABEL: flat_system_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -817,10 +754,6 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX10-WGP-LABEL: flat_system_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -832,10 +765,6 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX10-CU-LABEL: flat_system_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -858,8 +787,6 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -870,8 +797,6 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -943,7 +868,7 @@ define amdgpu_kernel void @flat_system_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out unordered, align 4
ret void
@@ -952,9 +877,6 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_store(
; GFX7-LABEL: flat_system_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -966,10 +888,6 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX10-WGP-LABEL: flat_system_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -981,10 +899,6 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX10-CU-LABEL: flat_system_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1007,8 +921,6 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1019,8 +931,6 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1092,7 +1002,7 @@ define amdgpu_kernel void @flat_system_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out monotonic, align 4
ret void
@@ -1101,9 +1011,6 @@ entry:
define amdgpu_kernel void @flat_system_release_store(
; GFX7-LABEL: flat_system_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1116,10 +1023,6 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX10-WGP-LABEL: flat_system_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1133,10 +1036,6 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX10-CU-LABEL: flat_system_release_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1162,8 +1061,6 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1176,8 +1073,6 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1269,7 +1164,7 @@ define amdgpu_kernel void @flat_system_release_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out release, align 4
ret void
@@ -1278,9 +1173,6 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX7-LABEL: flat_system_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1293,10 +1185,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1310,10 +1198,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX10-CU-LABEL: flat_system_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1339,8 +1223,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1353,8 +1235,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1446,7 +1326,7 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out seq_cst, align 4
ret void
@@ -1455,9 +1335,6 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX7-LABEL: flat_system_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1469,10 +1346,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1484,10 +1357,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1510,8 +1379,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1522,8 +1389,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1595,7 +1460,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in monotonic
ret void
@@ -1604,9 +1469,6 @@ entry:
define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX7-LABEL: flat_system_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1620,10 +1482,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1639,10 +1497,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1670,8 +1524,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1685,8 +1537,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1777,7 +1627,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
ret void
@@ -1786,9 +1636,6 @@ entry:
define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX7-LABEL: flat_system_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1801,10 +1648,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1818,10 +1661,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1847,8 +1686,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1861,8 +1698,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1954,7 +1789,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in release
ret void
@@ -1963,9 +1798,6 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX7-LABEL: flat_system_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1980,10 +1812,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2001,10 +1829,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2035,8 +1859,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2052,8 +1874,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2164,7 +1984,7 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
ret void
@@ -2173,9 +1993,6 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX7-LABEL: flat_system_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2190,10 +2007,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2211,10 +2024,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2245,8 +2054,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2262,8 +2069,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2374,7 +2179,7 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
ret void
@@ -2383,9 +2188,6 @@ entry:
define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2402,10 +2204,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2423,10 +2221,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2459,8 +2253,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2476,8 +2268,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2584,7 +2374,7 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
store i32 %val, ptr %out, align 4
@@ -2594,9 +2384,6 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2614,10 +2401,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2637,10 +2420,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2676,8 +2455,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2695,8 +2472,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2827,7 +2602,7 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
store i32 %val, ptr %out, align 4
@@ -2837,9 +2612,6 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2857,10 +2629,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2880,10 +2648,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2919,8 +2683,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2938,8 +2700,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -3070,7 +2830,7 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
store i32 %val, ptr %out, align 4
@@ -3080,9 +2840,6 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3108,10 +2865,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3137,10 +2890,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3191,8 +2940,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3207,8 +2954,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3308,7 +3053,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic monotonic
@@ -3318,9 +3063,6 @@ entry:
define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3348,10 +3090,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3381,10 +3119,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3440,8 +3174,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3459,8 +3191,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3579,7 +3309,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire monotonic
@@ -3589,9 +3319,6 @@ entry:
define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3618,10 +3345,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3649,10 +3372,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3706,8 +3425,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3724,8 +3441,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3845,7 +3560,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release monotonic
@@ -3855,9 +3570,6 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3886,10 +3598,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3921,10 +3629,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3983,8 +3687,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4004,8 +3706,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4144,7 +3844,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel monotonic
@@ -4154,9 +3854,6 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4185,10 +3882,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4220,10 +3913,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4282,8 +3971,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4303,8 +3990,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4443,7 +4128,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst monotonic
@@ -4453,9 +4138,6 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4483,10 +4165,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4516,10 +4194,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4575,8 +4249,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4594,8 +4266,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4714,7 +4384,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic acquire
@@ -4724,9 +4394,6 @@ entry:
define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4754,10 +4421,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4787,10 +4450,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4846,8 +4505,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4865,8 +4522,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4985,7 +4640,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire acquire
@@ -4995,9 +4650,6 @@ entry:
define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX7-LABEL: flat_system_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5026,10 +4678,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5061,10 +4709,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5123,8 +4767,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5144,8 +4786,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5284,7 +4924,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release acquire
@@ -5294,9 +4934,6 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5325,10 +4962,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5360,10 +4993,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5422,8 +5051,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5443,8 +5070,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5583,7 +5208,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel acquire
@@ -5593,9 +5218,6 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5624,10 +5246,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5659,10 +5277,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5721,8 +5335,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5742,8 +5354,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5882,7 +5492,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst acquire
@@ -5892,9 +5502,6 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5923,10 +5530,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5958,10 +5561,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6020,8 +5619,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6041,8 +5638,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6181,7 +5776,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic seq_cst
@@ -6191,9 +5786,6 @@ entry:
define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6222,10 +5814,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6257,10 +5845,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6319,8 +5903,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6340,8 +5922,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6480,7 +6060,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire seq_cst
@@ -6490,9 +6070,6 @@ entry:
define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6521,10 +6098,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6556,10 +6129,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6618,8 +6187,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6639,8 +6206,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6779,7 +6344,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release seq_cst
@@ -6789,9 +6354,6 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6820,10 +6382,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6855,10 +6413,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6917,8 +6471,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6938,8 +6490,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7078,7 +6628,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel seq_cst
@@ -7088,9 +6638,6 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -7119,10 +6666,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -7154,10 +6697,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -7216,8 +6755,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7237,8 +6774,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7377,7 +6912,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -7387,9 +6922,6 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7419,10 +6951,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7452,10 +6980,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7514,8 +7038,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7533,8 +7055,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7659,7 +7179,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic monotonic
@@ -7671,9 +7191,6 @@ entry:
define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7704,10 +7221,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7739,10 +7252,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7803,8 +7312,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7824,8 +7331,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7960,7 +7465,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire monotonic
@@ -7972,9 +7477,6 @@ entry:
define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8005,10 +7507,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8040,10 +7538,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8105,8 +7599,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8126,8 +7618,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8272,7 +7762,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release monotonic
@@ -8284,9 +7774,6 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8318,10 +7805,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8355,10 +7838,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8422,8 +7901,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8445,8 +7922,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8605,7 +8080,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel monotonic
@@ -8617,9 +8092,6 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8651,10 +8123,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8688,10 +8156,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8755,8 +8219,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8778,8 +8240,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8938,7 +8398,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst monotonic
@@ -8950,9 +8410,6 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8983,10 +8440,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9018,10 +8471,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9082,8 +8531,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9103,8 +8550,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9243,7 +8688,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic acquire
@@ -9255,9 +8700,6 @@ entry:
define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9288,10 +8730,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9323,10 +8761,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9387,8 +8821,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9408,8 +8840,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9544,7 +8974,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire acquire
@@ -9556,9 +8986,6 @@ entry:
define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9590,10 +9017,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9627,10 +9050,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9694,8 +9113,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9717,8 +9134,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9877,7 +9292,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release acquire
@@ -9889,9 +9304,6 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9923,10 +9335,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9960,10 +9368,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10027,8 +9431,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10050,8 +9452,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10210,7 +9610,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel acquire
@@ -10222,9 +9622,6 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10256,10 +9653,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10293,10 +9686,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10360,8 +9749,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10383,8 +9770,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10543,7 +9928,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst acquire
@@ -10555,9 +9940,6 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10589,10 +9971,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10626,10 +10004,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10693,8 +10067,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10716,8 +10088,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10876,7 +10246,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic seq_cst
@@ -10888,9 +10258,6 @@ entry:
define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10922,10 +10289,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10959,10 +10322,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11026,8 +10385,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11049,8 +10406,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11205,7 +10560,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire seq_cst
@@ -11217,9 +10572,6 @@ entry:
define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11251,10 +10603,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11288,10 +10636,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11355,8 +10699,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11378,8 +10720,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11538,7 +10878,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release seq_cst
@@ -11550,9 +10890,6 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11584,10 +10921,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11621,10 +10954,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11688,8 +11017,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11711,8 +11038,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11871,7 +11196,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel seq_cst
@@ -11883,9 +11208,6 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11917,10 +11239,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11954,10 +11272,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -12021,8 +11335,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12044,8 +11356,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12204,7 +11514,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -12216,9 +11526,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX7-LABEL: flat_system_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12233,10 +11540,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12251,10 +11554,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_system_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12283,8 +11582,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12297,8 +11594,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12388,7 +11683,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") unordered, align 4
store i32 %val, ptr %out
@@ -12398,9 +11693,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX7-LABEL: flat_system_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12415,10 +11707,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12433,10 +11721,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12465,8 +11749,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12479,8 +11761,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12570,7 +11850,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") monotonic, align 4
store i32 %val, ptr %out
@@ -12580,9 +11860,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX7-LABEL: flat_system_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12599,10 +11876,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12620,10 +11893,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12656,8 +11925,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12673,8 +11940,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12779,7 +12044,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") acquire, align 4
store i32 %val, ptr %out
@@ -12789,9 +12054,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX7-LABEL: flat_system_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12809,10 +12071,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12832,10 +12090,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12871,8 +12125,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12889,8 +12141,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13014,7 +12264,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4
store i32 %val, ptr %out
@@ -13024,9 +12274,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX7-LABEL: flat_system_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13038,10 +12285,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13053,10 +12296,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_system_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13079,8 +12318,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13091,8 +12328,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13164,7 +12399,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4
ret void
@@ -13173,9 +12408,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX7-LABEL: flat_system_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13187,10 +12419,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13202,10 +12430,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13228,8 +12452,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13240,8 +12462,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13313,7 +12533,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4
ret void
@@ -13322,9 +12542,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX7-LABEL: flat_system_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13337,10 +12554,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13354,10 +12567,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX10-CU-LABEL: flat_system_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13383,8 +12592,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13397,8 +12604,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13490,7 +12695,7 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") release, align 4
ret void
@@ -13499,9 +12704,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX7-LABEL: flat_system_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13514,10 +12716,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13531,10 +12729,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13560,8 +12754,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13574,8 +12766,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13667,7 +12857,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4
ret void
@@ -13676,9 +12866,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13690,10 +12877,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13705,10 +12888,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13731,8 +12910,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13743,8 +12920,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13816,7 +12991,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic
ret void
@@ -13825,9 +13000,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13841,10 +13013,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13859,10 +13027,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13889,8 +13053,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13904,8 +13066,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13994,7 +13154,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
ret void
@@ -14003,9 +13163,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX7-LABEL: flat_system_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14018,10 +13175,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14035,10 +13188,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14064,8 +13213,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14078,8 +13225,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14171,7 +13316,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release
ret void
@@ -14180,9 +13325,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14197,10 +13339,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14217,10 +13355,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14250,8 +13384,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14267,8 +13399,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14377,7 +13507,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
ret void
@@ -14386,9 +13516,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14403,10 +13530,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14423,10 +13546,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14456,8 +13575,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14473,8 +13590,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14583,7 +13698,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
ret void
@@ -14592,9 +13707,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14612,10 +13724,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14634,10 +13742,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14672,8 +13776,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14690,8 +13792,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14803,7 +13903,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
store i32 %val, ptr %out, align 4
@@ -14813,9 +13913,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14834,10 +13931,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14858,10 +13951,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14899,8 +13988,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14919,8 +14006,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -15056,7 +14141,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
store i32 %val, ptr %out, align 4
@@ -15066,9 +14151,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -15087,10 +14169,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -15111,10 +14189,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -15152,8 +14226,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -15172,8 +14244,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -15309,7 +14379,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
store i32 %val, ptr %out, align 4
@@ -15319,9 +14389,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15347,10 +14414,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15376,10 +14439,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15430,8 +14489,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15446,8 +14503,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15547,7 +14602,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
@@ -15557,9 +14612,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15587,10 +14639,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15619,10 +14667,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15677,8 +14721,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15696,8 +14738,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15814,7 +14854,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
@@ -15824,9 +14864,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15853,10 +14890,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15884,10 +14917,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15941,8 +14970,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15959,8 +14986,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16080,7 +15105,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
@@ -16090,9 +15115,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16121,10 +15143,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16155,10 +15173,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16216,8 +15230,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16237,8 +15249,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16375,7 +15385,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
@@ -16385,9 +15395,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16416,10 +15423,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16450,10 +15453,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16511,8 +15510,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16532,8 +15529,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16670,7 +15665,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
@@ -16680,9 +15675,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16710,10 +15702,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16742,10 +15730,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16800,8 +15784,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16819,8 +15801,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16937,7 +15917,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
@@ -16947,9 +15927,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16977,10 +15954,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17009,10 +15982,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17067,8 +16036,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17086,8 +16053,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17204,7 +16169,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
@@ -17214,9 +16179,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17245,10 +16207,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17279,10 +16237,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17340,8 +16294,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17361,8 +16313,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17499,7 +16449,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release acquire
@@ -17509,9 +16459,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17540,10 +16487,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17574,10 +16517,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17635,8 +16574,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17656,8 +16593,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17794,7 +16729,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
@@ -17804,9 +16739,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17835,10 +16767,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17869,10 +16797,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17930,8 +16854,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17951,8 +16873,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18089,7 +17009,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
@@ -18099,9 +17019,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18130,10 +17047,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18164,10 +17077,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18225,8 +17134,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18246,8 +17153,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18384,7 +17289,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
@@ -18394,9 +17299,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18425,10 +17327,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18459,10 +17357,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18520,8 +17414,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18541,8 +17433,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18679,7 +17569,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
@@ -18689,9 +17579,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18720,10 +17607,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18754,10 +17637,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18815,8 +17694,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18836,8 +17713,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18974,7 +17849,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
@@ -18984,9 +17859,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -19015,10 +17887,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -19049,10 +17917,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -19110,8 +17974,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19131,8 +17993,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19269,7 +18129,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
@@ -19279,9 +18139,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -19310,10 +18167,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -19344,10 +18197,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -19405,8 +18254,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19426,8 +18273,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19564,7 +18409,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
@@ -19574,9 +18419,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19606,10 +18448,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19639,10 +18477,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19701,8 +18535,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19720,8 +18552,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19846,7 +18676,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
@@ -19858,9 +18688,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19892,10 +18719,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19928,10 +18751,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19994,8 +18813,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20016,8 +18833,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20157,7 +18972,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
@@ -20169,9 +18984,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20202,10 +19014,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20237,10 +19045,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20302,8 +19106,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20323,8 +19125,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20469,7 +19269,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
@@ -20481,9 +19281,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20516,10 +19313,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20554,10 +19347,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20623,8 +19412,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20647,8 +19434,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20812,7 +19597,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
@@ -20824,9 +19609,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20859,10 +19641,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20897,10 +19675,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20966,8 +19740,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20990,8 +19762,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21155,7 +19925,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
@@ -21167,9 +19937,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21201,10 +19968,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21237,10 +20000,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21303,8 +20062,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21325,8 +20082,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21470,7 +20225,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
@@ -21482,9 +20237,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21516,10 +20268,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21552,10 +20300,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21618,8 +20362,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21640,8 +20382,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21781,7 +20521,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
@@ -21793,9 +20533,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21828,10 +20565,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21866,10 +20599,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21935,8 +20664,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21959,8 +20686,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22124,7 +20849,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release acquire
@@ -22136,9 +20861,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22171,10 +20893,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22209,10 +20927,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22278,8 +20992,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22302,8 +21014,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22467,7 +21177,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
@@ -22479,9 +21189,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22514,10 +21221,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22552,10 +21255,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22621,8 +21320,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22645,8 +21342,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22810,7 +21505,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
@@ -22822,9 +21517,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22857,10 +21549,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22895,10 +21583,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22964,8 +21648,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22988,8 +21670,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -23153,7 +21833,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
@@ -23165,9 +21845,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -23200,10 +21877,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -23238,10 +21911,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -23307,8 +21976,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -23331,8 +21998,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -23492,7 +22157,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
@@ -23504,9 +22169,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -23539,10 +22201,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -23577,10 +22235,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -23646,8 +22300,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -23670,8 +22322,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -23835,7 +22485,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
@@ -23847,9 +22497,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -23882,10 +22529,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -23920,10 +22563,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -23989,8 +22628,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -24013,8 +22650,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -24178,7 +22813,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
@@ -24190,9 +22825,6 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -24225,10 +22857,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -24263,10 +22891,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -24332,8 +22956,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -24356,8 +22978,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -24521,7 +23141,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
@@ -24529,3 +23149,5 @@ entry:
store i32 %val0, ptr %out, align 4
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index a88e0e217fdb4..ed4292454913e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -11,9 +11,6 @@
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,10 +26,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -48,10 +41,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-CU-LABEL: flat_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -143,7 +132,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load volatile i32, ptr %in, align 4
store i32 %val, ptr %out
@@ -153,9 +142,6 @@ entry:
define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX7-LABEL: flat_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -186,10 +172,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
@@ -220,10 +202,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-CU-LABEL: flat_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
@@ -415,7 +393,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr %in, i32 %tid
@@ -427,9 +405,6 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX7-LABEL: flat_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -445,10 +420,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -464,10 +435,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-CU-LABEL: flat_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -563,7 +530,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load i32, ptr %in, align 4
store volatile i32 %val, ptr %out
@@ -573,9 +540,6 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX7-LABEL: flat_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -605,10 +569,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -637,10 +597,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-CU-LABEL: flat_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -831,7 +787,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr %in, align 4
@@ -843,9 +799,6 @@ entry:
define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX7-LABEL: flat_volatile_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -861,10 +814,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
;
; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -880,10 +829,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
;
; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -971,7 +916,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic volatile i32, ptr %in syncscope("workgroup") acquire, align 4
store i32 %val, ptr %out
@@ -981,9 +926,6 @@ entry:
define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX7-LABEL: flat_volatile_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -996,10 +938,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
;
; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1013,10 +951,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
;
; GFX10-CU-LABEL: flat_volatile_workgroup_release_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1090,10 +1024,12 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index 7c637a20ab47b..34911b17657bb 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -15,9 +15,6 @@
define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX7-LABEL: flat_wavefront_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -32,10 +29,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX10-WGP-LABEL: flat_wavefront_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -50,10 +43,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX10-CU-LABEL: flat_wavefront_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -82,8 +71,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -96,8 +83,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -187,7 +172,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront") unordered, align 4
store i32 %val, ptr %out
@@ -197,9 +182,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX7-LABEL: flat_wavefront_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -214,10 +196,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -232,10 +210,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -264,8 +238,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -278,8 +250,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -369,7 +339,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront") monotonic, align 4
store i32 %val, ptr %out
@@ -379,9 +349,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX7-LABEL: flat_wavefront_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -396,10 +363,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -414,10 +377,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -446,8 +405,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -460,8 +417,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -551,7 +506,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront") acquire, align 4
store i32 %val, ptr %out
@@ -561,9 +516,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX7-LABEL: flat_wavefront_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -578,10 +530,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -596,10 +544,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -628,8 +572,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -642,8 +584,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -733,7 +673,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront") seq_cst, align 4
store i32 %val, ptr %out
@@ -743,9 +683,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX7-LABEL: flat_wavefront_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -757,10 +694,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX10-WGP-LABEL: flat_wavefront_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -772,10 +705,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX10-CU-LABEL: flat_wavefront_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -798,8 +727,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -810,8 +737,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -883,7 +808,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4
ret void
@@ -892,9 +817,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX7-LABEL: flat_wavefront_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -906,10 +828,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -921,10 +839,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -947,8 +861,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -959,8 +871,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1032,7 +942,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4
ret void
@@ -1041,9 +951,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_store(
; GFX7-LABEL: flat_wavefront_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1055,10 +962,6 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX10-WGP-LABEL: flat_wavefront_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1070,10 +973,6 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX10-CU-LABEL: flat_wavefront_release_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1096,8 +995,6 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1108,8 +1005,6 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1181,7 +1076,7 @@ define amdgpu_kernel void @flat_wavefront_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4
ret void
@@ -1190,9 +1085,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX7-LABEL: flat_wavefront_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1204,10 +1096,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1219,10 +1107,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1245,8 +1129,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1257,8 +1139,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1330,7 +1210,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4
ret void
@@ -1339,9 +1219,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1353,10 +1230,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1368,10 +1241,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1394,8 +1263,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1406,8 +1273,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1479,7 +1344,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic
ret void
@@ -1488,9 +1353,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX7-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1502,10 +1364,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1517,10 +1375,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1543,8 +1397,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1555,8 +1407,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1628,7 +1478,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
ret void
@@ -1637,9 +1487,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX7-LABEL: flat_wavefront_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1651,10 +1498,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1666,10 +1509,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1692,8 +1531,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1704,8 +1541,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1777,7 +1612,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release
ret void
@@ -1786,9 +1621,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1800,10 +1632,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1815,10 +1643,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1841,8 +1665,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1853,8 +1675,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1926,7 +1746,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
ret void
@@ -1935,9 +1755,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1949,10 +1766,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1964,10 +1777,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1990,8 +1799,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2002,8 +1809,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2075,7 +1880,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
ret void
@@ -2084,9 +1889,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2102,10 +1904,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2121,10 +1919,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2155,8 +1949,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2170,8 +1962,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2268,7 +2058,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
store i32 %val, ptr %out, align 4
@@ -2278,9 +2068,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2296,10 +2083,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2315,10 +2098,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2349,8 +2128,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2364,8 +2141,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2462,7 +2237,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
store i32 %val, ptr %out, align 4
@@ -2472,9 +2247,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2490,10 +2262,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2509,10 +2277,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2543,8 +2307,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2558,8 +2320,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2656,7 +2416,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
store i32 %val, ptr %out, align 4
@@ -2666,9 +2426,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2694,10 +2451,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2723,10 +2476,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2777,8 +2526,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2793,8 +2540,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2894,7 +2639,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
@@ -2904,9 +2649,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2932,10 +2674,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2961,10 +2699,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3015,8 +2749,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3031,8 +2763,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3132,7 +2862,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
@@ -3142,9 +2872,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3170,10 +2897,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3199,10 +2922,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3253,8 +2972,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3269,8 +2986,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3370,7 +3085,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
@@ -3380,9 +3095,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3408,10 +3120,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3437,10 +3145,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3491,8 +3195,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3507,8 +3209,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3608,7 +3308,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
@@ -3618,9 +3318,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3646,10 +3343,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3675,10 +3368,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3729,8 +3418,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3745,8 +3432,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3846,7 +3531,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
@@ -3856,9 +3541,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3884,10 +3566,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3913,10 +3591,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3967,8 +3641,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3983,8 +3655,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4084,7 +3754,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
@@ -4094,9 +3764,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4122,10 +3789,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4151,10 +3814,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4205,8 +3864,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4221,8 +3878,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4322,7 +3977,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
@@ -4332,9 +3987,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4360,10 +4012,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4389,10 +4037,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4443,8 +4087,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4459,8 +4101,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4560,7 +4200,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
@@ -4570,9 +4210,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4598,10 +4235,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4627,10 +4260,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4681,8 +4310,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4697,8 +4324,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4798,7 +4423,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
@@ -4808,9 +4433,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4836,10 +4458,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4865,10 +4483,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4919,8 +4533,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4935,8 +4547,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5036,7 +4646,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
@@ -5046,9 +4656,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5074,10 +4681,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5103,10 +4706,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5157,8 +4756,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5173,8 +4770,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5274,7 +4869,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
@@ -5284,9 +4879,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5312,10 +4904,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5341,10 +4929,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5395,8 +4979,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5411,8 +4993,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5512,7 +5092,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
@@ -5522,9 +5102,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5550,10 +5127,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5579,10 +5152,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5633,8 +5202,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5649,8 +5216,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5750,7 +5315,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
@@ -5760,9 +5325,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5788,10 +5350,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5817,10 +5375,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5871,8 +5425,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5887,8 +5439,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5988,7 +5538,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
@@ -5998,9 +5548,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6026,10 +5573,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6055,10 +5598,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6109,8 +5648,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6125,8 +5662,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6226,7 +5761,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
@@ -6236,9 +5771,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6268,10 +5800,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6301,10 +5829,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6363,8 +5887,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6382,8 +5904,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6508,7 +6028,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
@@ -6520,9 +6040,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6552,10 +6069,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6585,10 +6098,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6647,8 +6156,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6666,8 +6173,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6792,7 +6297,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
@@ -6804,9 +6309,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6836,10 +6338,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6869,10 +6367,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6931,8 +6425,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6950,8 +6442,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7076,7 +6566,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
@@ -7088,9 +6578,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7120,10 +6607,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7153,10 +6636,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7215,8 +6694,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7234,8 +6711,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7360,7 +6835,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
@@ -7372,9 +6847,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7404,10 +6876,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7437,10 +6905,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7499,8 +6963,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7518,8 +6980,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7644,7 +7104,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
@@ -7656,9 +7116,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7688,10 +7145,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7721,10 +7174,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7783,8 +7232,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7802,8 +7249,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7928,7 +7373,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
@@ -7940,9 +7385,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7972,10 +7414,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8005,10 +7443,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8067,8 +7501,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8086,8 +7518,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8212,7 +7642,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
@@ -8224,9 +7654,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8256,10 +7683,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8289,10 +7712,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8351,8 +7770,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8370,8 +7787,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8496,7 +7911,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
@@ -8508,9 +7923,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8540,10 +7952,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8573,10 +7981,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8635,8 +8039,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8654,8 +8056,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8780,7 +8180,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
@@ -8792,9 +8192,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8824,10 +8221,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8857,10 +8250,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8919,8 +8308,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8938,8 +8325,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9064,7 +8449,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
@@ -9076,9 +8461,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9108,10 +8490,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9141,10 +8519,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9203,8 +8577,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9222,8 +8594,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9348,7 +8718,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
@@ -9360,9 +8730,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9392,10 +8759,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9425,10 +8788,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9487,8 +8846,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9506,8 +8863,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9632,7 +8987,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
@@ -9644,9 +8999,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9676,10 +9028,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9709,10 +9057,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9771,8 +9115,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9790,8 +9132,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9916,7 +9256,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
@@ -9928,9 +9268,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9960,10 +9297,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9993,10 +9326,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10055,8 +9384,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10074,8 +9401,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10200,7 +9525,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
@@ -10212,9 +9537,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10244,10 +9566,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10277,10 +9595,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10339,8 +9653,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10358,8 +9670,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10484,7 +9794,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
@@ -10496,9 +9806,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
; GFX7-LABEL: flat_wavefront_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10513,10 +9820,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10531,10 +9834,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10563,8 +9862,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10577,8 +9874,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10668,7 +9963,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront-one-as") unordered, align 4
store i32 %val, ptr %out
@@ -10678,9 +9973,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10695,10 +9987,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10713,10 +10001,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10745,8 +10029,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10759,8 +10041,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10850,7 +10130,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront-one-as") monotonic, align 4
store i32 %val, ptr %out
@@ -10860,9 +10140,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
; GFX7-LABEL: flat_wavefront_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10877,10 +10154,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10895,10 +10168,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10927,8 +10196,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10941,8 +10208,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11032,7 +10297,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront-one-as") acquire, align 4
store i32 %val, ptr %out
@@ -11042,9 +10307,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11059,10 +10321,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11077,10 +10335,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11109,8 +10363,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11123,8 +10375,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11214,7 +10464,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront-one-as") seq_cst, align 4
store i32 %val, ptr %out
@@ -11224,9 +10474,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
; GFX7-LABEL: flat_wavefront_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11238,10 +10485,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11253,10 +10496,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11279,8 +10518,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11291,8 +10528,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11364,7 +10599,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4
ret void
@@ -11373,9 +10608,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11387,10 +10619,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11402,10 +10630,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11428,8 +10652,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11440,8 +10662,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11513,7 +10733,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4
ret void
@@ -11522,9 +10742,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_store(
; GFX7-LABEL: flat_wavefront_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11536,10 +10753,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11551,10 +10764,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11577,8 +10786,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11589,8 +10796,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11662,7 +10867,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4
ret void
@@ -11671,9 +10876,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11685,10 +10887,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11700,10 +10898,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11726,8 +10920,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11738,8 +10930,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11811,7 +11001,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4
ret void
@@ -11820,9 +11010,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11834,10 +11021,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11849,10 +11032,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11875,8 +11054,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11887,8 +11064,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11960,7 +11135,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic
ret void
@@ -11969,9 +11144,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11983,10 +11155,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11998,10 +11166,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12024,8 +11188,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12036,8 +11198,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12109,7 +11269,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
ret void
@@ -12118,9 +11278,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12132,10 +11289,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12147,10 +11300,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12173,8 +11322,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12185,8 +11332,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12258,7 +11403,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release
ret void
@@ -12267,9 +11412,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12281,10 +11423,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12296,10 +11434,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12322,8 +11456,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12334,8 +11466,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12407,7 +11537,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
ret void
@@ -12416,9 +11546,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12430,10 +11557,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12445,10 +11568,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12471,8 +11590,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12483,8 +11600,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12556,7 +11671,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
ret void
@@ -12565,9 +11680,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12583,10 +11695,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12602,10 +11710,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12636,8 +11740,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12651,8 +11753,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12749,7 +11849,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
store i32 %val, ptr %out, align 4
@@ -12759,9 +11859,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12777,10 +11874,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12796,10 +11889,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12830,8 +11919,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12845,8 +11932,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12943,7 +12028,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
store i32 %val, ptr %out, align 4
@@ -12953,9 +12038,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12971,10 +12053,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12990,10 +12068,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13024,8 +12098,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13039,8 +12111,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13137,7 +12207,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
store i32 %val, ptr %out, align 4
@@ -13147,9 +12217,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13175,10 +12242,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13204,10 +12267,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13258,8 +12317,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13274,8 +12331,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13375,7 +12430,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
@@ -13385,9 +12440,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13413,10 +12465,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13442,10 +12490,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13496,8 +12540,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13512,8 +12554,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13613,7 +12653,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
@@ -13623,9 +12663,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13651,10 +12688,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13680,10 +12713,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13734,8 +12763,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13750,8 +12777,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13851,7 +12876,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
@@ -13861,9 +12886,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13889,10 +12911,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13918,10 +12936,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13972,8 +12986,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13988,8 +13000,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14089,7 +13099,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
@@ -14099,9 +13109,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14127,10 +13134,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14156,10 +13159,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14210,8 +13209,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14226,8 +13223,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14327,7 +13322,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
@@ -14337,9 +13332,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14365,10 +13357,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14394,10 +13382,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14448,8 +13432,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14464,8 +13446,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14565,7 +13545,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
@@ -14575,9 +13555,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14603,10 +13580,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14632,10 +13605,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14686,8 +13655,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14702,8 +13669,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14803,7 +13768,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
@@ -14813,9 +13778,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14841,10 +13803,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14870,10 +13828,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14924,8 +13878,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14940,8 +13892,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15041,7 +13991,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
@@ -15051,9 +14001,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15079,10 +14026,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15108,10 +14051,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15162,8 +14101,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15178,8 +14115,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15279,7 +14214,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
@@ -15289,9 +14224,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15317,10 +14249,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15346,10 +14274,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15400,8 +14324,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15416,8 +14338,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15517,7 +14437,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
@@ -15527,9 +14447,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15555,10 +14472,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15584,10 +14497,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15638,8 +14547,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15654,8 +14561,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15755,7 +14660,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
@@ -15765,9 +14670,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15793,10 +14695,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15822,10 +14720,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15876,8 +14770,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15892,8 +14784,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15993,7 +14883,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
@@ -16003,9 +14893,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16031,10 +14918,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16060,10 +14943,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16114,8 +14993,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16130,8 +15007,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16231,7 +15106,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
@@ -16241,9 +15116,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16269,10 +15141,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16298,10 +15166,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16352,8 +15216,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16368,8 +15230,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16469,7 +15329,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
@@ -16479,9 +15339,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16507,10 +15364,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16536,10 +15389,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16590,8 +15439,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16606,8 +15453,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16707,7 +15552,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
@@ -16717,9 +15562,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16749,10 +15591,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16782,10 +15620,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16844,8 +15678,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16863,8 +15695,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16989,7 +15819,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
@@ -17001,9 +15831,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17033,10 +15860,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17066,10 +15889,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17128,8 +15947,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17147,8 +15964,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17273,7 +16088,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
@@ -17285,9 +16100,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17317,10 +16129,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17350,10 +16158,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17412,8 +16216,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17431,8 +16233,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17557,7 +16357,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
@@ -17569,9 +16369,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17601,10 +16398,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17634,10 +16427,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17696,8 +16485,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17715,8 +16502,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17841,7 +16626,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
@@ -17853,9 +16638,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17885,10 +16667,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17918,10 +16696,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17980,8 +16754,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17999,8 +16771,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18125,7 +16895,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
@@ -18137,9 +16907,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18169,10 +16936,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18202,10 +16965,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18264,8 +17023,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18283,8 +17040,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18409,7 +17164,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
@@ -18421,9 +17176,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18453,10 +17205,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18486,10 +17234,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18548,8 +17292,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18567,8 +17309,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18693,7 +17433,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
@@ -18705,9 +17445,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18737,10 +17474,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18770,10 +17503,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18832,8 +17561,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18851,8 +17578,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18977,7 +17702,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
@@ -18989,9 +17714,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19021,10 +17743,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19054,10 +17772,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19116,8 +17830,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19135,8 +17847,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19261,7 +17971,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
@@ -19273,9 +17983,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19305,10 +18012,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19338,10 +18041,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19400,8 +18099,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19419,8 +18116,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19545,7 +18240,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
@@ -19557,9 +18252,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19589,10 +18281,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19622,10 +18310,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19684,8 +18368,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19703,8 +18385,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19829,7 +18509,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
@@ -19841,9 +18521,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19873,10 +18550,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19906,10 +18579,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19968,8 +18637,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19987,8 +18654,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20113,7 +18778,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
@@ -20125,9 +18790,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20157,10 +18819,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20190,10 +18848,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20252,8 +18906,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20271,8 +18923,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20397,7 +19047,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
@@ -20409,9 +19059,6 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20441,10 +19088,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20474,10 +19117,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20536,8 +19175,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20555,8 +19192,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20681,7 +19316,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
@@ -20689,3 +19324,5 @@ entry:
store i32 %val0, ptr %out, align 4
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 0fd4aa4a7a93f..915135224b982 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -15,9 +15,6 @@
define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX7-LABEL: flat_workgroup_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -32,10 +29,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX10-WGP-LABEL: flat_workgroup_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -50,10 +43,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX10-CU-LABEL: flat_workgroup_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -82,8 +71,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -96,8 +83,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -187,7 +172,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") unordered, align 4
store i32 %val, ptr %out
@@ -197,9 +182,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX7-LABEL: flat_workgroup_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -214,10 +196,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -232,10 +210,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -264,8 +238,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -278,8 +250,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -369,7 +339,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") monotonic, align 4
store i32 %val, ptr %out
@@ -379,9 +349,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX7-LABEL: flat_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -397,10 +364,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -416,10 +379,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -450,8 +409,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -465,8 +422,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -563,7 +518,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") acquire, align 4
store i32 %val, ptr %out
@@ -573,9 +528,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX7-LABEL: flat_workgroup_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -592,10 +544,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -613,10 +561,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -649,8 +593,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -665,8 +607,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -776,7 +716,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4
store i32 %val, ptr %out
@@ -786,9 +726,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX7-LABEL: flat_workgroup_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -800,10 +737,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX10-WGP-LABEL: flat_workgroup_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -815,10 +748,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX10-CU-LABEL: flat_workgroup_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -841,8 +770,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -853,8 +780,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -926,7 +851,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4
ret void
@@ -935,9 +860,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX7-LABEL: flat_workgroup_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -949,10 +871,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -964,10 +882,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -990,8 +904,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1002,8 +914,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1075,7 +985,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4
ret void
@@ -1084,9 +994,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_store(
; GFX7-LABEL: flat_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1099,10 +1006,6 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX10-WGP-LABEL: flat_workgroup_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1116,10 +1019,6 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX10-CU-LABEL: flat_workgroup_release_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1144,8 +1043,6 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1157,8 +1054,6 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1241,7 +1136,7 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4
ret void
@@ -1250,9 +1145,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX7-LABEL: flat_workgroup_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1265,10 +1157,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1282,10 +1170,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1310,8 +1194,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1323,8 +1205,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1407,7 +1287,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4
ret void
@@ -1416,9 +1296,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1430,10 +1307,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1445,10 +1318,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1471,8 +1340,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1483,8 +1350,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1556,7 +1421,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic
ret void
@@ -1565,9 +1430,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX7-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1580,10 +1442,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1598,10 +1456,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1626,8 +1480,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1639,8 +1491,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1724,7 +1574,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
ret void
@@ -1733,9 +1583,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX7-LABEL: flat_workgroup_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1748,10 +1595,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1765,10 +1608,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1793,8 +1632,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1806,8 +1643,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1890,7 +1725,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release
ret void
@@ -1899,9 +1734,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1915,10 +1747,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1935,10 +1763,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1965,8 +1789,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1979,8 +1801,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2075,7 +1895,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
ret void
@@ -2084,9 +1904,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2100,10 +1917,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2120,10 +1933,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2150,8 +1959,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2164,8 +1971,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2260,7 +2065,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
ret void
@@ -2269,9 +2074,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2288,10 +2090,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2308,10 +2106,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2344,8 +2138,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2360,8 +2152,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2465,7 +2255,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
store i32 %val, ptr %out, align 4
@@ -2475,9 +2265,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2495,10 +2282,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2517,10 +2300,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2555,8 +2334,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2572,8 +2349,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2690,7 +2465,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
store i32 %val, ptr %out, align 4
@@ -2700,9 +2475,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2720,10 +2492,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2742,10 +2510,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2780,8 +2544,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2797,8 +2559,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2915,7 +2675,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out, align 4
@@ -2925,9 +2685,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2953,10 +2710,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2982,10 +2735,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3036,8 +2785,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3052,8 +2799,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3153,7 +2898,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
@@ -3163,9 +2908,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3192,10 +2934,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3224,10 +2962,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3280,8 +3014,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3297,8 +3029,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3410,7 +3140,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
@@ -3420,9 +3150,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3449,10 +3176,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3480,10 +3203,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3536,8 +3255,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3553,8 +3270,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3665,7 +3380,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
@@ -3675,9 +3390,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3705,10 +3417,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3739,10 +3447,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3797,8 +3501,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3815,8 +3517,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3939,7 +3639,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
@@ -3949,9 +3649,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3979,10 +3676,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4013,10 +3706,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4071,8 +3760,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4089,8 +3776,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4213,7 +3898,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
@@ -4223,9 +3908,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4252,10 +3934,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4284,10 +3962,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4340,8 +4014,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4357,8 +4029,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4470,7 +4140,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
@@ -4480,9 +4150,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4509,10 +4176,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4541,10 +4204,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4597,8 +4256,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4614,8 +4271,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4727,7 +4382,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
@@ -4737,9 +4392,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4767,10 +4419,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4801,10 +4449,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4859,8 +4503,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4877,8 +4519,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5001,7 +4641,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
@@ -5011,9 +4651,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5041,10 +4678,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5075,10 +4708,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5133,8 +4762,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5151,8 +4778,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5275,7 +4900,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
@@ -5285,9 +4910,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5315,10 +4937,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5349,10 +4967,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5407,8 +5021,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5425,8 +5037,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5549,7 +5159,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
@@ -5559,9 +5169,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5589,10 +5196,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5623,10 +5226,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5681,8 +5280,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5699,8 +5296,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5823,7 +5418,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
@@ -5833,9 +5428,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -5865,10 +5457,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5898,10 +5486,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5960,8 +5544,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5979,8 +5561,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6105,7 +5685,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
@@ -6117,9 +5697,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6150,10 +5727,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6184,10 +5757,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6248,8 +5817,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6268,8 +5835,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6401,7 +5966,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
@@ -6413,9 +5978,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6446,10 +6008,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6481,10 +6039,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6545,8 +6099,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6565,8 +6117,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6702,7 +6252,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
@@ -6714,9 +6264,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6748,10 +6295,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6784,10 +6327,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6850,8 +6389,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6871,8 +6408,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7017,7 +6552,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
@@ -7029,9 +6564,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7063,10 +6595,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7099,10 +6627,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7165,8 +6689,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7186,8 +6708,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7332,7 +6852,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
@@ -7344,9 +6864,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7377,10 +6894,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7411,10 +6924,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7475,8 +6984,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7495,8 +7002,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7630,7 +7135,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
@@ -7642,9 +7147,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7675,10 +7177,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7709,10 +7207,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7773,8 +7267,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7793,8 +7285,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7926,7 +7416,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
@@ -7938,9 +7428,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7972,10 +7459,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8008,10 +7491,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8074,8 +7553,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8095,8 +7572,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8241,7 +7716,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
@@ -8253,9 +7728,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8287,10 +7759,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8323,10 +7791,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8389,8 +7853,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8410,8 +7872,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8556,7 +8016,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
@@ -8568,9 +8028,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8602,10 +8059,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8638,10 +8091,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8704,8 +8153,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8725,8 +8172,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8871,7 +8316,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
@@ -8883,9 +8328,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8917,10 +8359,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8953,10 +8391,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9019,8 +8453,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9040,8 +8472,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9186,7 +8616,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst
@@ -9198,9 +8628,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9232,10 +8659,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9268,10 +8691,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9334,8 +8753,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9355,8 +8772,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9499,7 +8914,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst
@@ -9511,9 +8926,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9545,10 +8957,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9581,10 +8989,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9647,8 +9051,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9668,8 +9070,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9814,7 +9214,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst
@@ -9826,9 +9226,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9860,10 +9257,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9896,10 +9289,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9962,8 +9351,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9983,8 +9370,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10129,7 +9514,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst
@@ -10141,9 +9526,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10175,10 +9557,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10211,10 +9589,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10277,8 +9651,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10298,8 +9670,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10444,7 +9814,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
@@ -10456,9 +9826,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10473,10 +9840,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10491,10 +9854,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10523,8 +9882,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10537,8 +9894,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10628,7 +9983,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") unordered, align 4
store i32 %val, ptr %out
@@ -10638,9 +9993,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10655,10 +10007,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10673,10 +10021,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10705,8 +10049,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10719,8 +10061,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10810,7 +10150,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") monotonic, align 4
store i32 %val, ptr %out
@@ -10820,9 +10160,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10837,10 +10174,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10857,10 +10190,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10889,8 +10218,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10903,8 +10230,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11000,7 +10325,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") acquire, align 4
store i32 %val, ptr %out
@@ -11010,9 +10335,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11027,10 +10349,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11049,10 +10367,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11081,8 +10395,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11095,8 +10407,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11202,7 +10512,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) {
+ ptr %in, ptr %out) #0 {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4
store i32 %val, ptr %out
@@ -11212,9 +10522,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11226,10 +10533,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11241,10 +10544,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11267,8 +10566,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11279,8 +10576,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11352,7 +10647,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4
ret void
@@ -11361,9 +10656,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11375,10 +10667,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11390,10 +10678,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11416,8 +10700,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11428,8 +10710,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11501,7 +10781,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4
ret void
@@ -11510,9 +10790,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX7-LABEL: flat_workgroup_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11524,10 +10801,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11541,10 +10814,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11567,8 +10836,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11579,8 +10846,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11660,7 +10925,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4
ret void
@@ -11669,9 +10934,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11683,10 +10945,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11700,10 +10958,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11726,8 +10980,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11738,8 +10990,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11819,7 +11069,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) {
+ i32 %in, ptr %out) #0 {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4
ret void
@@ -11828,9 +11078,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11842,10 +11089,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11857,10 +11100,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11883,8 +11122,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11895,8 +11132,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11968,7 +11203,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic
ret void
@@ -11977,9 +11212,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11991,10 +11223,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12008,10 +11236,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12034,8 +11258,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12046,8 +11268,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12127,7 +11347,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
ret void
@@ -12136,9 +11356,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12150,10 +11367,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12167,10 +11380,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12193,8 +11402,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12205,8 +11412,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12286,7 +11491,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release
ret void
@@ -12295,9 +11500,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12309,10 +11511,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12328,10 +11526,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12354,8 +11548,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12366,8 +11558,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12455,7 +11645,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
ret void
@@ -12464,9 +11654,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12478,10 +11665,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12497,10 +11680,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12523,8 +11702,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12535,8 +11712,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12624,7 +11799,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
ret void
@@ -12633,9 +11808,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12651,10 +11823,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12672,10 +11840,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12706,8 +11870,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12721,8 +11883,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12825,7 +11985,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
store i32 %val, ptr %out, align 4
@@ -12835,9 +11995,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12853,10 +12010,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12876,10 +12029,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12910,8 +12059,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12925,8 +12072,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13039,7 +12184,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
store i32 %val, ptr %out, align 4
@@ -13049,9 +12194,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13067,10 +12209,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13090,10 +12228,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13124,8 +12258,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13139,8 +12271,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13253,7 +12383,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) {
+ ptr %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
store i32 %val, ptr %out, align 4
@@ -13263,9 +12393,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13291,10 +12418,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13320,10 +12443,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13374,8 +12493,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13390,8 +12507,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13491,7 +12606,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
@@ -13501,9 +12616,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13529,10 +12641,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13560,10 +12668,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13614,8 +12718,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13630,8 +12732,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13739,7 +12839,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
@@ -13749,9 +12849,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13777,10 +12874,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13808,10 +12901,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13862,8 +12951,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13878,8 +12965,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13987,7 +13072,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
@@ -13997,9 +13082,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14025,10 +13107,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14058,10 +13136,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14112,8 +13186,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14128,8 +13200,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14245,7 +13315,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
@@ -14255,9 +13325,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14283,10 +13350,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14316,10 +13379,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14370,8 +13429,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14386,8 +13443,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14503,7 +13558,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
@@ -14513,9 +13568,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14541,10 +13593,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14572,10 +13620,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14626,8 +13670,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14642,8 +13684,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14751,7 +13791,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
@@ -14761,9 +13801,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14789,10 +13826,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14820,10 +13853,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14874,8 +13903,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14890,8 +13917,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14999,7 +14024,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
@@ -15009,9 +14034,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15037,10 +14059,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15070,10 +14088,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15124,8 +14138,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15140,8 +14152,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15257,7 +14267,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
@@ -15267,9 +14277,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15295,10 +14302,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15328,10 +14331,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15382,8 +14381,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15398,8 +14395,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15515,7 +14510,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
@@ -15525,9 +14520,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15553,10 +14545,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15586,10 +14574,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15640,8 +14624,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15656,8 +14638,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15773,7 +14753,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
@@ -15783,9 +14763,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15811,10 +14788,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15844,10 +14817,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15898,8 +14867,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15914,8 +14881,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16031,7 +14996,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
@@ -16041,9 +15006,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16069,10 +15031,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16102,10 +15060,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16156,8 +15110,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16172,8 +15124,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16289,7 +15239,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
@@ -16299,9 +15249,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16327,10 +15274,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16360,10 +15303,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16414,8 +15353,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16430,8 +15367,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16547,7 +15482,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
@@ -16557,9 +15492,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16585,10 +15517,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16618,10 +15546,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16672,8 +15596,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16688,8 +15610,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16805,7 +15725,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
@@ -16815,9 +15735,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16843,10 +15760,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16876,10 +15789,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16930,8 +15839,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16946,8 +15853,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17063,7 +15968,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
@@ -17073,9 +15978,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17105,10 +16007,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17138,10 +16036,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17200,8 +16094,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17219,8 +16111,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17345,7 +16235,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
@@ -17357,9 +16247,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17389,10 +16276,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17424,10 +16307,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17486,8 +16365,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17505,8 +16382,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17637,7 +16512,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
@@ -17649,9 +16524,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17681,10 +16553,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17716,10 +16584,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17778,8 +16642,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17797,8 +16659,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17931,7 +16791,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
@@ -17943,9 +16803,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17975,10 +16832,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18012,10 +16865,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18074,8 +16923,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18093,8 +16940,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18235,7 +17080,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
@@ -18247,9 +17092,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18279,10 +17121,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18316,10 +17154,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18378,8 +17212,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18397,8 +17229,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18539,7 +17369,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
@@ -18551,9 +17381,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18583,10 +17410,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18618,10 +17441,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18680,8 +17499,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18699,8 +17516,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18833,7 +17648,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
@@ -18845,9 +17660,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18877,10 +17689,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18912,10 +17720,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18974,8 +17778,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18993,8 +17795,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19125,7 +17925,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
@@ -19137,9 +17937,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19169,10 +17966,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19206,10 +17999,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19268,8 +18057,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19287,8 +18074,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19429,7 +18214,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
@@ -19441,9 +18226,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19473,10 +18255,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19510,10 +18288,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19572,8 +18346,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19591,8 +18363,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19733,7 +18503,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
@@ -19745,9 +18515,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19777,10 +18544,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19814,10 +18577,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19876,8 +18635,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19895,8 +18652,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20037,7 +18792,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
@@ -20049,9 +18804,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20081,10 +18833,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20118,10 +18866,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20180,8 +18924,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20199,8 +18941,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20341,7 +19081,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
@@ -20353,9 +19093,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20385,10 +19122,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20422,10 +19155,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20484,8 +19213,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20503,8 +19230,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20643,7 +19368,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
@@ -20655,9 +19380,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20687,10 +19409,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20724,10 +19442,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20786,8 +19500,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20805,8 +19517,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20947,7 +19657,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
@@ -20959,9 +19669,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20991,10 +19698,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21028,10 +19731,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21090,8 +19789,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21109,8 +19806,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21251,7 +19946,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
@@ -21263,9 +19958,6 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21295,10 +19987,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
-; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21332,10 +20020,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
-; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21394,8 +20078,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21413,8 +20095,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21555,7 +20235,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) {
+ ptr %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
@@ -21563,3 +20243,5 @@ entry:
store i32 %val0, ptr %out, align 4
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 8b600c835a160..9e6226516f0b8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -41,9 +41,6 @@ define amdgpu_kernel void @global_agent_unordered_load(
;
; GFX7-LABEL: global_agent_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -190,7 +187,7 @@ define amdgpu_kernel void @global_agent_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -225,9 +222,6 @@ define amdgpu_kernel void @global_agent_monotonic_load(
;
; GFX7-LABEL: global_agent_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -374,7 +368,7 @@ define amdgpu_kernel void @global_agent_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -410,9 +404,6 @@ define amdgpu_kernel void @global_agent_acquire_load(
;
; GFX7-LABEL: global_agent_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -574,7 +565,7 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -611,9 +602,6 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
;
; GFX7-LABEL: global_agent_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -793,7 +781,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -821,9 +809,6 @@ define amdgpu_kernel void @global_agent_unordered_store(
;
; GFX7-LABEL: global_agent_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -950,7 +935,7 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4
ret void
@@ -977,9 +962,6 @@ define amdgpu_kernel void @global_agent_monotonic_store(
;
; GFX7-LABEL: global_agent_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1106,7 +1088,7 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4
ret void
@@ -1134,9 +1116,6 @@ define amdgpu_kernel void @global_agent_release_store(
;
; GFX7-LABEL: global_agent_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1287,7 +1266,7 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4
ret void
@@ -1315,9 +1294,6 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
;
; GFX7-LABEL: global_agent_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1468,7 +1444,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4
ret void
@@ -1494,9 +1470,6 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
;
; GFX7-LABEL: global_agent_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1622,7 +1595,7 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") monotonic
ret void
@@ -1650,9 +1623,6 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
;
; GFX7-LABEL: global_agent_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1805,7 +1775,7 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
ret void
@@ -1832,9 +1802,6 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
;
; GFX7-LABEL: global_agent_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1984,7 +1951,7 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") release
ret void
@@ -2013,9 +1980,6 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_agent_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2192,7 +2156,7 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
ret void
@@ -2221,9 +2185,6 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_agent_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2400,7 +2361,7 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
ret void
@@ -2429,9 +2390,6 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2598,7 +2556,7 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2629,9 +2587,6 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2826,7 +2781,7 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2857,9 +2812,6 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -3054,7 +3006,7 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -3086,9 +3038,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3273,7 +3222,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic
@@ -3307,9 +3256,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3521,7 +3467,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
@@ -3554,9 +3500,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3765,7 +3708,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") release monotonic
@@ -3800,9 +3743,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4038,7 +3978,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
@@ -4073,9 +4013,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4311,7 +4248,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
@@ -4345,9 +4282,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4559,7 +4493,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire
@@ -4593,9 +4527,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4807,7 +4738,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
@@ -4842,9 +4773,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5080,7 +5008,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") release acquire
@@ -5115,9 +5043,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5353,7 +5278,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
@@ -5388,9 +5313,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5626,7 +5548,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
@@ -5661,9 +5583,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5899,7 +5818,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst
@@ -5934,9 +5853,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6172,7 +6088,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst
@@ -6207,9 +6123,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6445,7 +6358,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") release seq_cst
@@ -6480,9 +6393,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6718,7 +6628,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst
@@ -6753,9 +6663,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6991,7 +6898,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -7026,9 +6933,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7240,7 +7144,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic
@@ -7278,9 +7182,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7508,7 +7409,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
@@ -7546,9 +7447,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7784,7 +7682,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") release monotonic
@@ -7823,9 +7721,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8081,7 +7976,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
@@ -8120,9 +8015,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8378,7 +8270,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
@@ -8416,9 +8308,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8650,7 +8539,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire
@@ -8688,9 +8577,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8918,7 +8804,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
@@ -8957,9 +8843,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9215,7 +9098,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") release acquire
@@ -9254,9 +9137,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9512,7 +9392,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
@@ -9551,9 +9431,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9809,7 +9686,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
@@ -9848,9 +9725,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10106,7 +9980,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst
@@ -10145,9 +10019,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10399,7 +10270,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst
@@ -10438,9 +10309,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10696,7 +10564,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") release seq_cst
@@ -10735,9 +10603,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10993,7 +10858,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst
@@ -11032,9 +10897,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11290,7 +11152,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -11327,9 +11189,6 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
;
; GFX7-LABEL: global_agent_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11476,7 +11335,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -11511,9 +11370,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
;
; GFX7-LABEL: global_agent_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11660,7 +11516,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -11696,9 +11552,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
;
; GFX7-LABEL: global_agent_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11860,7 +11713,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -11897,9 +11750,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12079,7 +11929,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -12107,9 +11957,6 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
;
; GFX7-LABEL: global_agent_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12236,7 +12083,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") unordered, align 4
ret void
@@ -12263,9 +12110,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
;
; GFX7-LABEL: global_agent_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12392,7 +12236,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") monotonic, align 4
ret void
@@ -12420,9 +12264,6 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
;
; GFX7-LABEL: global_agent_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12573,7 +12414,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") release, align 4
ret void
@@ -12601,9 +12442,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12754,7 +12592,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") seq_cst, align 4
ret void
@@ -12780,9 +12618,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12908,7 +12743,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") monotonic
ret void
@@ -12936,9 +12771,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13091,7 +12923,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
ret void
@@ -13118,9 +12950,6 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13270,7 +13099,7 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") release
ret void
@@ -13299,9 +13128,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13478,7 +13304,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
ret void
@@ -13507,9 +13333,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13686,7 +13509,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
ret void
@@ -13715,9 +13538,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13884,7 +13704,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -13915,9 +13735,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14112,7 +13929,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -14143,9 +13960,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14340,7 +14154,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -14372,9 +14186,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14559,7 +14370,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
@@ -14593,9 +14404,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14807,7 +14615,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
@@ -14840,9 +14648,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15051,7 +14856,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic
@@ -15086,9 +14891,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15324,7 +15126,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
@@ -15359,9 +15161,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15597,7 +15396,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
@@ -15631,9 +15430,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15845,7 +15641,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire
@@ -15879,9 +15675,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16093,7 +15886,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
@@ -16128,9 +15921,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16366,7 +16156,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
@@ -16401,9 +16191,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16639,7 +16426,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
@@ -16674,9 +16461,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16912,7 +16696,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
@@ -16947,9 +16731,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17185,7 +16966,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst
@@ -17220,9 +17001,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17458,7 +17236,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst
@@ -17493,9 +17271,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17731,7 +17506,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst
@@ -17766,9 +17541,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18004,7 +17776,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst
@@ -18039,9 +17811,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18277,7 +18046,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
@@ -18312,9 +18081,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18526,7 +18292,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
@@ -18564,9 +18330,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18794,7 +18557,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
@@ -18833,9 +18596,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19091,7 +18851,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
@@ -19130,9 +18890,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19388,7 +19145,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
@@ -19426,9 +19183,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19660,7 +19414,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire
@@ -19698,9 +19452,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19928,7 +19679,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
@@ -19967,9 +19718,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20225,7 +19973,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
@@ -20264,9 +20012,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20522,7 +20267,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
@@ -20561,9 +20306,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20819,7 +20561,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
@@ -20858,9 +20600,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21116,7 +20855,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst
@@ -21155,9 +20894,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21409,7 +21145,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst
@@ -21448,9 +21184,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21706,7 +21439,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst
@@ -21745,9 +21478,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22003,7 +21733,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst
@@ -22042,9 +21772,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22300,7 +22027,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
@@ -22308,3 +22035,5 @@ entry:
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 16e55058e4fc8..9afd2b5183efb 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -36,9 +36,6 @@ define amdgpu_kernel void @global_nontemporal_load_0(
;
; GFX7-LABEL: global_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -189,7 +186,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load i32, ptr addrspace(1) %in, align 4, !nontemporal !0
store i32 %val, ptr addrspace(1) %out
@@ -230,9 +227,6 @@ define amdgpu_kernel void @global_nontemporal_load_1(
;
; GFX7-LABEL: global_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -448,7 +442,7 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
@@ -480,9 +474,6 @@ define amdgpu_kernel void @global_nontemporal_store_0(
;
; GFX7-LABEL: global_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -633,7 +624,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
store i32 %val, ptr addrspace(1) %out, !nontemporal !0
@@ -667,9 +658,6 @@ define amdgpu_kernel void @global_nontemporal_store_1(
;
; GFX7-LABEL: global_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -866,7 +854,7 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr addrspace(1) %in, align 4
@@ -903,9 +891,6 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
;
; GFX7-LABEL: global_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1056,7 +1041,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0
store i32 %val, ptr addrspace(1) %out
@@ -1065,3 +1050,5 @@ entry:
!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index 8042d38716107..73958d6e2c3d6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -41,9 +41,6 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
;
; GFX7-LABEL: global_singlethread_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -190,7 +187,7 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -225,9 +222,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
;
; GFX7-LABEL: global_singlethread_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -374,7 +368,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -409,9 +403,6 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
;
; GFX7-LABEL: global_singlethread_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -558,7 +549,7 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -593,9 +584,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
;
; GFX7-LABEL: global_singlethread_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -742,7 +730,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -770,9 +758,6 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
;
; GFX7-LABEL: global_singlethread_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -899,7 +884,7 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") unordered, align 4
ret void
@@ -926,9 +911,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
;
; GFX7-LABEL: global_singlethread_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1055,7 +1037,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") monotonic, align 4
ret void
@@ -1082,9 +1064,6 @@ define amdgpu_kernel void @global_singlethread_release_store(
;
; GFX7-LABEL: global_singlethread_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1211,7 +1190,7 @@ define amdgpu_kernel void @global_singlethread_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") release, align 4
ret void
@@ -1238,9 +1217,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
;
; GFX7-LABEL: global_singlethread_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1367,7 +1343,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") seq_cst, align 4
ret void
@@ -1393,9 +1369,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
;
; GFX7-LABEL: global_singlethread_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1521,7 +1494,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") monotonic
ret void
@@ -1547,9 +1520,6 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1675,7 +1645,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
ret void
@@ -1701,9 +1671,6 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
;
; GFX7-LABEL: global_singlethread_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1829,7 +1796,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") release
ret void
@@ -1855,9 +1822,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1983,7 +1947,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
ret void
@@ -2009,9 +1973,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2137,7 +2098,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
ret void
@@ -2165,9 +2126,6 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2319,7 +2277,7 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2348,9 +2306,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2502,7 +2457,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2531,9 +2486,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2685,7 +2637,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2717,9 +2669,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2904,7 +2853,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
@@ -2936,9 +2885,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3123,7 +3069,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
@@ -3155,9 +3101,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3342,7 +3285,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
@@ -3374,9 +3317,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3561,7 +3501,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
@@ -3593,9 +3533,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3780,7 +3717,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
@@ -3812,9 +3749,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3999,7 +3933,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
@@ -4031,9 +3965,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4218,7 +4149,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
@@ -4250,9 +4181,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4437,7 +4365,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
@@ -4469,9 +4397,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4656,7 +4581,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
@@ -4688,9 +4613,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4875,7 +4797,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
@@ -4907,9 +4829,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5094,7 +5013,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
@@ -5126,9 +5045,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5313,7 +5229,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
@@ -5345,9 +5261,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5532,7 +5445,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
@@ -5564,9 +5477,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5751,7 +5661,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
@@ -5783,9 +5693,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5970,7 +5877,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
@@ -6005,9 +5912,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6219,7 +6123,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
@@ -6256,9 +6160,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6470,7 +6371,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
@@ -6507,9 +6408,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6721,7 +6619,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
@@ -6758,9 +6656,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6972,7 +6867,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
@@ -7009,9 +6904,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7223,7 +7115,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
@@ -7260,9 +7152,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7474,7 +7363,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
@@ -7511,9 +7400,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7725,7 +7611,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
@@ -7762,9 +7648,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7976,7 +7859,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
@@ -8013,9 +7896,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8227,7 +8107,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
@@ -8264,9 +8144,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8478,7 +8355,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
@@ -8515,9 +8392,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8729,7 +8603,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
@@ -8766,9 +8640,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8980,7 +8851,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
@@ -9017,9 +8888,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9231,7 +9099,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
@@ -9268,9 +9136,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9482,7 +9347,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
@@ -9519,9 +9384,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9733,7 +9595,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
@@ -9770,9 +9632,6 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
;
; GFX7-LABEL: global_singlethread_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9919,7 +9778,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -9954,9 +9813,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10103,7 +9959,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10138,9 +9994,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10287,7 +10140,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10322,9 +10175,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10471,7 +10321,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10499,9 +10349,6 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
;
; GFX7-LABEL: global_singlethread_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10628,7 +10475,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") unordered, align 4
ret void
@@ -10655,9 +10502,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10784,7 +10628,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") monotonic, align 4
ret void
@@ -10811,9 +10655,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
;
; GFX7-LABEL: global_singlethread_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10940,7 +10781,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") release, align 4
ret void
@@ -10967,9 +10808,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11096,7 +10934,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") seq_cst, align 4
ret void
@@ -11122,9 +10960,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11250,7 +11085,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") monotonic
ret void
@@ -11276,9 +11111,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11404,7 +11236,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
ret void
@@ -11430,9 +11262,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11558,7 +11387,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") release
ret void
@@ -11584,9 +11413,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11712,7 +11538,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
ret void
@@ -11738,9 +11564,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11866,7 +11689,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
ret void
@@ -11894,9 +11717,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12048,7 +11868,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12077,9 +11897,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12231,7 +12048,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12260,9 +12077,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12414,7 +12228,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12446,9 +12260,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12633,7 +12444,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
@@ -12665,9 +12476,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12852,7 +12660,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
@@ -12884,9 +12692,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13071,7 +12876,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
@@ -13103,9 +12908,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13290,7 +13092,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
@@ -13322,9 +13124,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13509,7 +13308,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
@@ -13541,9 +13340,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13728,7 +13524,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
@@ -13760,9 +13556,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13947,7 +13740,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
@@ -13979,9 +13772,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14166,7 +13956,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
@@ -14198,9 +13988,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14385,7 +14172,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
@@ -14417,9 +14204,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14604,7 +14388,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
@@ -14636,9 +14420,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14823,7 +14604,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
@@ -14855,9 +14636,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15042,7 +14820,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
@@ -15074,9 +14852,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15261,7 +15036,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
@@ -15293,9 +15068,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15480,7 +15252,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
@@ -15512,9 +15284,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15699,7 +15468,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
@@ -15734,9 +15503,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15948,7 +15714,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
@@ -15985,9 +15751,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16199,7 +15962,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
@@ -16236,9 +15999,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16450,7 +16210,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
@@ -16487,9 +16247,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16701,7 +16458,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
@@ -16738,9 +16495,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16952,7 +16706,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
@@ -16989,9 +16743,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17203,7 +16954,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
@@ -17240,9 +16991,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17454,7 +17202,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
@@ -17491,9 +17239,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17705,7 +17450,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
@@ -17742,9 +17487,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17956,7 +17698,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
@@ -17993,9 +17735,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18207,7 +17946,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
@@ -18244,9 +17983,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18458,7 +18194,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
@@ -18495,9 +18231,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18709,7 +18442,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
@@ -18746,9 +18479,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18960,7 +18690,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
@@ -18997,9 +18727,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19211,7 +18938,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
@@ -19248,9 +18975,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19462,7 +19186,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
@@ -19470,3 +19194,5 @@ entry:
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 9c11781da56f2..7d98eeaad7998 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -41,9 +41,6 @@ define amdgpu_kernel void @global_system_unordered_load(
;
; GFX7-LABEL: global_system_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -190,7 +187,7 @@ define amdgpu_kernel void @global_system_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -225,9 +222,6 @@ define amdgpu_kernel void @global_system_monotonic_load(
;
; GFX7-LABEL: global_system_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -374,7 +368,7 @@ define amdgpu_kernel void @global_system_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -410,9 +404,6 @@ define amdgpu_kernel void @global_system_acquire_load(
;
; GFX7-LABEL: global_system_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -576,7 +567,7 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -613,9 +604,6 @@ define amdgpu_kernel void @global_system_seq_cst_load(
;
; GFX7-LABEL: global_system_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -797,7 +785,7 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -825,9 +813,6 @@ define amdgpu_kernel void @global_system_unordered_store(
;
; GFX7-LABEL: global_system_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -954,7 +939,7 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out unordered, align 4
ret void
@@ -981,9 +966,6 @@ define amdgpu_kernel void @global_system_monotonic_store(
;
; GFX7-LABEL: global_system_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1110,7 +1092,7 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4
ret void
@@ -1138,9 +1120,6 @@ define amdgpu_kernel void @global_system_release_store(
;
; GFX7-LABEL: global_system_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1295,7 +1274,7 @@ define amdgpu_kernel void @global_system_release_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out release, align 4
ret void
@@ -1323,9 +1302,6 @@ define amdgpu_kernel void @global_system_seq_cst_store(
;
; GFX7-LABEL: global_system_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1480,7 +1456,7 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4
ret void
@@ -1506,9 +1482,6 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
;
; GFX7-LABEL: global_system_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1634,7 +1607,7 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic
ret void
@@ -1662,9 +1635,6 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
;
; GFX7-LABEL: global_system_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1819,7 +1789,7 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
ret void
@@ -1846,9 +1816,6 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
;
; GFX7-LABEL: global_system_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2002,7 +1969,7 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release
ret void
@@ -2031,9 +1998,6 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_system_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2216,7 +2180,7 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
ret void
@@ -2245,9 +2209,6 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_system_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2430,7 +2391,7 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
ret void
@@ -2459,9 +2420,6 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_system_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2630,7 +2588,7 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2661,9 +2619,6 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2864,7 +2819,7 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2895,9 +2850,6 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -3098,7 +3050,7 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -3130,9 +3082,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3317,7 +3266,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic monotonic
@@ -3351,9 +3300,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3567,7 +3513,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire monotonic
@@ -3600,9 +3546,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3815,7 +3758,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in release monotonic
@@ -3850,9 +3793,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4094,7 +4034,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel monotonic
@@ -4129,9 +4069,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4373,7 +4310,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst monotonic
@@ -4407,9 +4344,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4623,7 +4557,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic acquire
@@ -4657,9 +4591,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4873,7 +4804,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire acquire
@@ -4908,9 +4839,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5152,7 +5080,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in release acquire
@@ -5187,9 +5115,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5431,7 +5356,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel acquire
@@ -5466,9 +5391,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5710,7 +5632,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst acquire
@@ -5745,9 +5667,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5989,7 +5908,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -6024,9 +5943,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6238,7 +6154,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic monotonic
@@ -6276,9 +6192,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6508,7 +6421,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire monotonic
@@ -6547,9 +6460,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6811,7 +6721,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel monotonic
@@ -6850,9 +6760,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7114,7 +7021,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst monotonic
@@ -7152,9 +7059,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7388,7 +7292,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic acquire
@@ -7426,9 +7330,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7658,7 +7559,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire acquire
@@ -7697,9 +7598,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7961,7 +7859,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in release acquire
@@ -8000,9 +7898,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8264,7 +8159,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel acquire
@@ -8303,9 +8198,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8567,7 +8459,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst acquire
@@ -8606,9 +8498,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8870,7 +8759,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic seq_cst
@@ -8909,9 +8798,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9169,7 +9055,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire seq_cst
@@ -9208,9 +9094,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9472,7 +9355,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in release seq_cst
@@ -9511,9 +9394,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9775,7 +9655,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel seq_cst
@@ -9814,9 +9694,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10078,7 +9955,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -10115,9 +9992,6 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
;
; GFX7-LABEL: global_system_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10264,7 +10138,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10299,9 +10173,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
;
; GFX7-LABEL: global_system_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10448,7 +10319,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10484,9 +10355,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
;
; GFX7-LABEL: global_system_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10650,7 +10518,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10687,9 +10555,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
;
; GFX7-LABEL: global_system_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10871,7 +10736,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10899,9 +10764,6 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
;
; GFX7-LABEL: global_system_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11028,7 +10890,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4
ret void
@@ -11055,9 +10917,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
;
; GFX7-LABEL: global_system_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11184,7 +11043,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4
ret void
@@ -11212,9 +11071,6 @@ define amdgpu_kernel void @global_system_one_as_release_store(
;
; GFX7-LABEL: global_system_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11369,7 +11225,7 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4
ret void
@@ -11397,9 +11253,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
;
; GFX7-LABEL: global_system_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11554,7 +11407,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4
ret void
@@ -11580,9 +11433,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11708,7 +11558,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic
ret void
@@ -11736,9 +11586,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11893,7 +11740,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
ret void
@@ -11920,9 +11767,6 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12076,7 +11920,7 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release
ret void
@@ -12105,9 +11949,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12290,7 +12131,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
ret void
@@ -12319,9 +12160,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12504,7 +12342,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
ret void
@@ -12533,9 +12371,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12704,7 +12539,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12735,9 +12570,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12938,7 +12770,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12969,9 +12801,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13172,7 +13001,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -13204,9 +13033,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13391,7 +13217,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
@@ -13425,9 +13251,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13641,7 +13464,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
@@ -13674,9 +13497,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13889,7 +13709,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
@@ -13924,9 +13744,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14168,7 +13985,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
@@ -14203,9 +14020,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14447,7 +14261,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
@@ -14481,9 +14295,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14697,7 +14508,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
@@ -14731,9 +14542,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14947,7 +14755,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
@@ -14982,9 +14790,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15226,7 +15031,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release acquire
@@ -15261,9 +15066,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15505,7 +15307,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
@@ -15540,9 +15342,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15784,7 +15583,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
@@ -15819,9 +15618,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16063,7 +15859,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
@@ -16098,9 +15894,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16342,7 +16135,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
@@ -16377,9 +16170,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16621,7 +16411,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
@@ -16656,9 +16446,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16900,7 +16687,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
@@ -16935,9 +16722,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17179,7 +16963,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
@@ -17214,9 +16998,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17428,7 +17209,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
@@ -17466,9 +17247,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17698,7 +17476,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
@@ -17736,9 +17514,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17978,7 +17753,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
@@ -18017,9 +17792,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18281,7 +18053,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
@@ -18320,9 +18092,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18584,7 +18353,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
@@ -18622,9 +18391,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18858,7 +18624,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
@@ -18896,9 +18662,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19128,7 +18891,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
@@ -19167,9 +18930,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19431,7 +19191,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release acquire
@@ -19470,9 +19230,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19734,7 +19491,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
@@ -19773,9 +19530,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20037,7 +19791,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
@@ -20076,9 +19830,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20340,7 +20091,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
@@ -20379,9 +20130,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20639,7 +20387,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
@@ -20678,9 +20426,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20942,7 +20687,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
@@ -20981,9 +20726,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21245,7 +20987,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
@@ -21284,9 +21026,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21548,7 +21287,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
@@ -21556,3 +21295,5 @@ entry:
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 8a5c5dda9f79c..66a8a9a0ac569 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -37,9 +37,6 @@ define amdgpu_kernel void @global_volatile_load_0(
;
; GFX7-LABEL: global_volatile_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -146,7 +143,7 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4
store i32 %val, ptr addrspace(1) %out
@@ -187,9 +184,6 @@ define amdgpu_kernel void @global_volatile_load_1(
;
; GFX7-LABEL: global_volatile_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -345,7 +339,7 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
@@ -378,9 +372,6 @@ define amdgpu_kernel void @global_volatile_store_0(
;
; GFX7-LABEL: global_volatile_store_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -501,7 +492,7 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
store volatile i32 %val, ptr addrspace(1) %out
@@ -536,9 +527,6 @@ define amdgpu_kernel void @global_volatile_store_1(
;
; GFX7-LABEL: global_volatile_store_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -693,7 +681,7 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr addrspace(1) %in, align 4
@@ -730,9 +718,6 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
;
; GFX7-LABEL: global_volatile_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -838,7 +823,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic volatile i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -867,9 +852,6 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
;
; GFX7-LABEL: global_volatile_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -969,10 +951,11 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic volatile i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index 151ba07a0b531..ecd584fd00e3b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -41,9 +41,6 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
;
; GFX7-LABEL: global_wavefront_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -190,7 +187,7 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -225,9 +222,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
;
; GFX7-LABEL: global_wavefront_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -374,7 +368,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -409,9 +403,6 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
;
; GFX7-LABEL: global_wavefront_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -558,7 +549,7 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -593,9 +584,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
;
; GFX7-LABEL: global_wavefront_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -742,7 +730,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -770,9 +758,6 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
;
; GFX7-LABEL: global_wavefront_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -899,7 +884,7 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4
ret void
@@ -926,9 +911,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
;
; GFX7-LABEL: global_wavefront_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1055,7 +1037,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4
ret void
@@ -1082,9 +1064,6 @@ define amdgpu_kernel void @global_wavefront_release_store(
;
; GFX7-LABEL: global_wavefront_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1211,7 +1190,7 @@ define amdgpu_kernel void @global_wavefront_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4
ret void
@@ -1238,9 +1217,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
;
; GFX7-LABEL: global_wavefront_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1367,7 +1343,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4
ret void
@@ -1393,9 +1369,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
;
; GFX7-LABEL: global_wavefront_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1521,7 +1494,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic
ret void
@@ -1547,9 +1520,6 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1675,7 +1645,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
ret void
@@ -1701,9 +1671,6 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
;
; GFX7-LABEL: global_wavefront_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1829,7 +1796,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release
ret void
@@ -1855,9 +1822,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1983,7 +1947,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
ret void
@@ -2009,9 +1973,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2137,7 +2098,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
ret void
@@ -2165,9 +2126,6 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2319,7 +2277,7 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2348,9 +2306,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2502,7 +2457,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2531,9 +2486,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2685,7 +2637,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2717,9 +2669,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2904,7 +2853,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
@@ -2936,9 +2885,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3123,7 +3069,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
@@ -3155,9 +3101,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3342,7 +3285,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
@@ -3374,9 +3317,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3561,7 +3501,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
@@ -3593,9 +3533,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3780,7 +3717,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
@@ -3812,9 +3749,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3999,7 +3933,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
@@ -4031,9 +3965,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4218,7 +4149,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
@@ -4250,9 +4181,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4437,7 +4365,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
@@ -4469,9 +4397,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4656,7 +4581,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
@@ -4688,9 +4613,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4875,7 +4797,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
@@ -4907,9 +4829,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5094,7 +5013,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
@@ -5126,9 +5045,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5313,7 +5229,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
@@ -5345,9 +5261,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5532,7 +5445,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
@@ -5564,9 +5477,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5751,7 +5661,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
@@ -5783,9 +5693,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5970,7 +5877,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
@@ -6005,9 +5912,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6219,7 +6123,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
@@ -6256,9 +6160,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6470,7 +6371,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
@@ -6507,9 +6408,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6721,7 +6619,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
@@ -6758,9 +6656,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6972,7 +6867,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
@@ -7009,9 +6904,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7223,7 +7115,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
@@ -7260,9 +7152,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7474,7 +7363,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
@@ -7511,9 +7400,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7725,7 +7611,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
@@ -7762,9 +7648,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7976,7 +7859,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
@@ -8013,9 +7896,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8227,7 +8107,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
@@ -8264,9 +8144,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8478,7 +8355,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
@@ -8515,9 +8392,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8729,7 +8603,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
@@ -8766,9 +8640,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8980,7 +8851,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
@@ -9017,9 +8888,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9231,7 +9099,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
@@ -9268,9 +9136,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9482,7 +9347,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
@@ -9519,9 +9384,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9733,7 +9595,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
@@ -9770,9 +9632,6 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
;
; GFX7-LABEL: global_wavefront_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9919,7 +9778,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -9954,9 +9813,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10103,7 +9959,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10138,9 +9994,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10287,7 +10140,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10322,9 +10175,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10471,7 +10321,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10499,9 +10349,6 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
;
; GFX7-LABEL: global_wavefront_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10628,7 +10475,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4
ret void
@@ -10655,9 +10502,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10784,7 +10628,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4
ret void
@@ -10811,9 +10655,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
;
; GFX7-LABEL: global_wavefront_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10940,7 +10781,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4
ret void
@@ -10967,9 +10808,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11096,7 +10934,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") seq_cst, align 4
ret void
@@ -11122,9 +10960,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11250,7 +11085,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic
ret void
@@ -11276,9 +11111,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11404,7 +11236,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
ret void
@@ -11430,9 +11262,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11558,7 +11387,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release
ret void
@@ -11584,9 +11413,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11712,7 +11538,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
ret void
@@ -11738,9 +11564,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11866,7 +11689,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
ret void
@@ -11894,9 +11717,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12048,7 +11868,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12077,9 +11897,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12231,7 +12048,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12260,9 +12077,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12414,7 +12228,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12446,9 +12260,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12633,7 +12444,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
@@ -12665,9 +12476,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12852,7 +12660,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
@@ -12884,9 +12692,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13071,7 +12876,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
@@ -13103,9 +12908,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13290,7 +13092,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
@@ -13322,9 +13124,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13509,7 +13308,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
@@ -13541,9 +13340,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13728,7 +13524,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
@@ -13760,9 +13556,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13947,7 +13740,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
@@ -13979,9 +13772,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14166,7 +13956,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
@@ -14198,9 +13988,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14385,7 +14172,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
@@ -14417,9 +14204,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14604,7 +14388,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
@@ -14636,9 +14420,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14823,7 +14604,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
@@ -14855,9 +14636,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15042,7 +14820,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
@@ -15074,9 +14852,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15261,7 +15036,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
@@ -15293,9 +15068,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15480,7 +15252,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
@@ -15512,9 +15284,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15699,7 +15468,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
@@ -15734,9 +15503,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15948,7 +15714,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
@@ -15985,9 +15751,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16199,7 +15962,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
@@ -16236,9 +15999,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16450,7 +16210,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
@@ -16487,9 +16247,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16701,7 +16458,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
@@ -16738,9 +16495,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16952,7 +16706,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
@@ -16989,9 +16743,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17203,7 +16954,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
@@ -17240,9 +16991,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17454,7 +17202,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
@@ -17491,9 +17239,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17705,7 +17450,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
@@ -17742,9 +17487,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17956,7 +17698,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
@@ -17993,9 +17735,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18207,7 +17946,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
@@ -18244,9 +17983,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18458,7 +18194,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
@@ -18495,9 +18231,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18709,7 +18442,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
@@ -18746,9 +18479,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18960,7 +18690,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
@@ -18997,9 +18727,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19211,7 +18938,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
@@ -19248,9 +18975,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19462,7 +19186,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
@@ -19470,3 +19194,5 @@ entry:
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 69b0c7f93ab0e..e32256a85a809 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -41,9 +41,6 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
;
; GFX7-LABEL: global_workgroup_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -190,7 +187,7 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -225,9 +222,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
;
; GFX7-LABEL: global_workgroup_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -374,7 +368,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -409,9 +403,6 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
;
; GFX7-LABEL: global_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -563,7 +554,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -599,9 +590,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
;
; GFX7-LABEL: global_workgroup_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -764,7 +752,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -792,9 +780,6 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
;
; GFX7-LABEL: global_workgroup_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -921,7 +906,7 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") unordered, align 4
ret void
@@ -948,9 +933,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
;
; GFX7-LABEL: global_workgroup_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1077,7 +1059,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") monotonic, align 4
ret void
@@ -1105,9 +1087,6 @@ define amdgpu_kernel void @global_workgroup_release_store(
;
; GFX7-LABEL: global_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1251,7 +1230,7 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
ret void
@@ -1279,9 +1258,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
;
; GFX7-LABEL: global_workgroup_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1425,7 +1401,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") seq_cst, align 4
ret void
@@ -1451,9 +1427,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
;
; GFX7-LABEL: global_workgroup_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1579,7 +1552,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") monotonic
ret void
@@ -1605,9 +1578,6 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1743,7 +1713,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
ret void
@@ -1770,9 +1740,6 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
;
; GFX7-LABEL: global_workgroup_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1915,7 +1882,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") release
ret void
@@ -1942,9 +1909,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2097,7 +2061,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
ret void
@@ -2124,9 +2088,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2279,7 +2240,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
ret void
@@ -2307,9 +2268,6 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2466,7 +2424,7 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2496,9 +2454,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2674,7 +2629,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2704,9 +2659,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2882,7 +2834,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2914,9 +2866,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3101,7 +3050,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
@@ -3133,9 +3082,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3330,7 +3276,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
@@ -3363,9 +3309,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3567,7 +3510,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
@@ -3600,9 +3543,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3814,7 +3754,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
@@ -3847,9 +3787,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4061,7 +3998,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
@@ -4093,9 +4030,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4290,7 +4224,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
@@ -4322,9 +4256,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4519,7 +4450,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
@@ -4552,9 +4483,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4766,7 +4694,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
@@ -4799,9 +4727,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5013,7 +4938,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
@@ -5046,9 +4971,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5260,7 +5182,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
@@ -5293,9 +5215,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5507,7 +5426,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst
@@ -5540,9 +5459,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5754,7 +5670,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst
@@ -5787,9 +5703,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6001,7 +5914,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst
@@ -6034,9 +5947,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6248,7 +6158,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst
@@ -6281,9 +6191,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6495,7 +6402,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
@@ -6530,9 +6437,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6744,7 +6648,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
@@ -6781,9 +6685,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7000,7 +6901,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
@@ -7038,9 +6939,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7269,7 +7167,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
@@ -7307,9 +7205,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7545,7 +7440,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
@@ -7583,9 +7478,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7821,7 +7713,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
@@ -7858,9 +7750,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8079,7 +7968,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
@@ -8116,9 +8005,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8335,7 +8221,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
@@ -8373,9 +8259,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8611,7 +8494,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
@@ -8649,9 +8532,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8887,7 +8767,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
@@ -8925,9 +8805,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9163,7 +9040,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
@@ -9201,9 +9078,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9439,7 +9313,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst
@@ -9477,9 +9351,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9713,7 +9584,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst
@@ -9751,9 +9622,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9989,7 +9857,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst
@@ -10027,9 +9895,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10265,7 +10130,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst
@@ -10303,9 +10168,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10541,7 +10403,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
@@ -10578,9 +10440,6 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
;
; GFX7-LABEL: global_workgroup_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10727,7 +10586,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10762,9 +10621,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10911,7 +10767,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10946,9 +10802,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11100,7 +10953,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -11135,9 +10988,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11297,7 +11147,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -11325,9 +11175,6 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
;
; GFX7-LABEL: global_workgroup_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11454,7 +11301,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") unordered, align 4
ret void
@@ -11481,9 +11328,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11610,7 +11454,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") monotonic, align 4
ret void
@@ -11637,9 +11481,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
;
; GFX7-LABEL: global_workgroup_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11776,7 +11617,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") release, align 4
ret void
@@ -11803,9 +11644,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11942,7 +11780,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) {
+ i32 %in, ptr addrspace(1) %out) #0 {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") seq_cst, align 4
ret void
@@ -11968,9 +11806,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12096,7 +11931,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") monotonic
ret void
@@ -12122,9 +11957,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12260,7 +12092,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
ret void
@@ -12286,9 +12118,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12424,7 +12253,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") release
ret void
@@ -12450,9 +12279,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12598,7 +12424,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
ret void
@@ -12624,9 +12450,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12772,7 +12595,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
ret void
@@ -12800,9 +12623,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12959,7 +12779,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12988,9 +12808,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13159,7 +12976,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -13188,9 +13005,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13359,7 +13173,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) {
+ ptr addrspace(1) %out, i32 %in) #0 {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -13391,9 +13205,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13578,7 +13389,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
@@ -13610,9 +13421,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13807,7 +13615,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
@@ -13839,9 +13647,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14036,7 +13841,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
@@ -14068,9 +13873,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14275,7 +14077,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
@@ -14307,9 +14109,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14514,7 +14313,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
@@ -14546,9 +14345,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14743,7 +14539,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
@@ -14775,9 +14571,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14972,7 +14765,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
@@ -15004,9 +14797,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15211,7 +15001,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
@@ -15243,9 +15033,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15450,7 +15237,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
@@ -15482,9 +15269,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15689,7 +15473,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
@@ -15721,9 +15505,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15928,7 +15709,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
@@ -15960,9 +15741,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16167,7 +15945,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
@@ -16199,9 +15977,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16406,7 +16181,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
@@ -16438,9 +16213,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16645,7 +16417,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
@@ -16677,9 +16449,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16884,7 +16653,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
@@ -16919,9 +16688,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17133,7 +16899,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
@@ -17170,9 +16936,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17389,7 +17152,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
@@ -17426,9 +17189,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17650,7 +17410,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
@@ -17687,9 +17447,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17918,7 +17675,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
@@ -17955,9 +17712,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18186,7 +17940,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
@@ -18223,9 +17977,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18444,7 +18195,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
@@ -18481,9 +18232,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18700,7 +18448,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
@@ -18737,9 +18485,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18968,7 +18713,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
@@ -19005,9 +18750,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19236,7 +18978,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
@@ -19273,9 +19015,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19504,7 +19243,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
@@ -19541,9 +19280,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19772,7 +19508,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
@@ -19809,9 +19545,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20038,7 +19771,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
@@ -20075,9 +19808,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20306,7 +20036,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
@@ -20343,9 +20073,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20574,7 +20301,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
@@ -20611,9 +20338,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20842,7 +20566,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) {
+ ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
@@ -20850,3 +20574,5 @@ entry:
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index 78209ee34cad4..7850b4dfd0ca0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -38,9 +38,6 @@ define amdgpu_kernel void @local_nontemporal_load_0(
;
; GFX7-LABEL: local_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
@@ -193,7 +190,7 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(3) %in, ptr addrspace(1) %out) {
+ ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load i32, ptr addrspace(3) %in, align 4, !nontemporal !0
store i32 %val, ptr addrspace(1) %out
@@ -227,9 +224,6 @@ define amdgpu_kernel void @local_nontemporal_load_1(
;
; GFX7-LABEL: local_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 s7, 2
@@ -428,7 +422,7 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(3) %in, ptr addrspace(1) %out) {
+ ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(3) %in, i32 %tid
@@ -597,7 +591,7 @@ define amdgpu_kernel void @local_nontemporal_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(3) %out) {
+ ptr addrspace(1) %in, ptr addrspace(3) %out) #0 {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
store i32 %val, ptr addrspace(3) %out, !nontemporal !0
@@ -802,7 +796,7 @@ define amdgpu_kernel void @local_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(3) %out) {
+ ptr addrspace(1) %in, ptr addrspace(3) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr addrspace(1) %in, align 4
@@ -836,9 +830,6 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
;
; GFX7-LABEL: local_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
@@ -991,7 +982,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(3) %in, ptr addrspace(1) %out) {
+ ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load volatile i32, ptr addrspace(3) %in, align 4, !nontemporal !0
store i32 %val, ptr addrspace(1) %out
@@ -1000,3 +991,4 @@ entry:
!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index bc2508411ed6b..39293f6b267a8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -34,9 +34,6 @@ define amdgpu_kernel void @local_volatile_load_0(
;
; GFX7-LABEL: local_volatile_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
@@ -141,7 +138,7 @@ define amdgpu_kernel void @local_volatile_load_0(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(3) %in, ptr addrspace(1) %out) {
+ ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load volatile i32, ptr addrspace(3) %in, align 4
store i32 %val, ptr addrspace(1) %out
@@ -175,9 +172,6 @@ define amdgpu_kernel void @local_volatile_load_1(
;
; GFX7-LABEL: local_volatile_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 s7, 2
@@ -308,7 +302,7 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(3) %in, ptr addrspace(1) %out) {
+ ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(3) %in, i32 %tid
@@ -439,7 +433,7 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(3) %out) {
+ ptr addrspace(1) %in, ptr addrspace(3) %out) #0 {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
store volatile i32 %val, ptr addrspace(3) %out
@@ -590,7 +584,7 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(3) %out) {
+ ptr addrspace(1) %in, ptr addrspace(3) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr addrspace(1) %in, align 4
@@ -718,7 +712,7 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(3) %in, ptr addrspace(3) %out) {
+ ptr addrspace(3) %in, ptr addrspace(3) %out) #0 {
entry:
%val = load atomic volatile i32, ptr addrspace(3) %in syncscope("workgroup") acquire, align 4
store i32 %val, ptr addrspace(3) %out
@@ -833,10 +827,11 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(3) %out) {
+ i32 %in, ptr addrspace(3) %out) #0 {
entry:
store atomic volatile i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 2aa4f021c259c..29dfce7b682de 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -38,10 +38,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX7-LABEL: private_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_add_u32 s0, s0, s15
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -56,7 +53,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX10-WGP-LABEL: private_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -70,7 +67,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX10-CU-LABEL: private_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -110,7 +107,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -124,7 +121,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -201,7 +198,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(5) %in, ptr addrspace(1) %out) {
+ ptr addrspace(5) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load i32, ptr addrspace(5) %in, align 4, !nontemporal !0
store i32 %val, ptr addrspace(1) %out
@@ -235,10 +232,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX7-LABEL: private_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_add_u32 s0, s0, s15
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -255,7 +249,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX10-WGP-LABEL: private_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -271,7 +265,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX10-CU-LABEL: private_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -315,7 +309,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -334,7 +328,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -450,7 +444,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(5) %in, ptr addrspace(1) %out) {
+ ptr addrspace(5) %in, ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(5) %in, i32 %tid
@@ -476,7 +470,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX7-LABEL: private_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_add_u32 s0, s0, s15
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
@@ -490,7 +484,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX10-WGP-LABEL: private_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -504,7 +498,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX10-CU-LABEL: private_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -536,7 +530,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -550,7 +544,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -627,7 +621,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(5) %out) {
+ ptr addrspace(1) %in, ptr addrspace(5) %out) #0 {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
store i32 %val, ptr addrspace(5) %out, !nontemporal !0
@@ -653,7 +647,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX7-LABEL: private_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_add_u32 s0, s0, s15
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
@@ -669,7 +663,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX10-WGP-LABEL: private_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -684,7 +678,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX10-CU-LABEL: private_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -719,7 +713,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -737,7 +731,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -846,7 +840,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: scratch_store_b32 v1, v0, s0 th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(5) %out) {
+ ptr addrspace(1) %in, ptr addrspace(5) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr addrspace(1) %in, align 4
@@ -880,10 +874,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX7-LABEL: private_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_add_u32 s0, s0, s15
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -898,7 +889,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX10-WGP-LABEL: private_nontemporal_volatile_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -912,7 +903,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX10-CU-LABEL: private_nontemporal_volatile_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -952,7 +943,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -966,7 +957,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -1047,7 +1038,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(5) %in, ptr addrspace(1) %out) {
+ ptr addrspace(5) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load volatile i32, ptr addrspace(5) %in, align 4, !nontemporal !0
store i32 %val, ptr addrspace(1) %out
@@ -1056,3 +1047,4 @@ entry:
!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index df4193969f8a0..77a93f2156543 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -38,10 +38,7 @@ define amdgpu_kernel void @private_volatile_load_0(
;
; GFX7-LABEL: private_volatile_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_add_u32 s0, s0, s15
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -56,7 +53,7 @@ define amdgpu_kernel void @private_volatile_load_0(
;
; GFX10-WGP-LABEL: private_volatile_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -70,7 +67,7 @@ define amdgpu_kernel void @private_volatile_load_0(
;
; GFX10-CU-LABEL: private_volatile_load_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -155,7 +152,7 @@ define amdgpu_kernel void @private_volatile_load_0(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(5) %in, ptr addrspace(1) %out) {
+ ptr addrspace(5) %in, ptr addrspace(1) %out) #0 {
entry:
%val = load volatile i32, ptr addrspace(5) %in, align 4
store i32 %val, ptr addrspace(1) %out
@@ -193,10 +190,7 @@ define amdgpu_kernel void @private_volatile_load_1(
;
; GFX7-LABEL: private_volatile_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX7-NEXT: s_add_i32 s12, s12, s17
-; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_add_u32 s0, s0, s15
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -213,7 +207,7 @@ define amdgpu_kernel void @private_volatile_load_1(
;
; GFX10-WGP-LABEL: private_volatile_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -229,7 +223,7 @@ define amdgpu_kernel void @private_volatile_load_1(
;
; GFX10-CU-LABEL: private_volatile_load_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -340,7 +334,7 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(5) %in, ptr addrspace(1) %out) {
+ ptr addrspace(5) %in, ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(5) %in, i32 %tid
@@ -371,7 +365,7 @@ define amdgpu_kernel void @private_volatile_store_0(
;
; GFX7-LABEL: private_volatile_store_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_add_u32 s0, s0, s15
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
@@ -386,7 +380,7 @@ define amdgpu_kernel void @private_volatile_store_0(
;
; GFX10-WGP-LABEL: private_volatile_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -401,7 +395,7 @@ define amdgpu_kernel void @private_volatile_store_0(
;
; GFX10-CU-LABEL: private_volatile_store_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -490,7 +484,7 @@ define amdgpu_kernel void @private_volatile_store_0(
; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(5) %out) {
+ ptr addrspace(1) %in, ptr addrspace(5) %out) #0 {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
store volatile i32 %val, ptr addrspace(5) %out
@@ -521,7 +515,7 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX7-LABEL: private_volatile_store_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_add_u32 s0, s0, s15
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
@@ -538,7 +532,7 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX10-WGP-LABEL: private_volatile_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -554,7 +548,7 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX10-CU-LABEL: private_volatile_store_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -664,7 +658,7 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX12-CU-NEXT: scratch_store_b32 v1, v0, s0 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(5) %out) {
+ ptr addrspace(1) %in, ptr addrspace(5) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr addrspace(1) %in, align 4
@@ -674,3 +668,4 @@ entry:
}
declare i32 @llvm.amdgcn.workitem.id.x()
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 07072f6a36296..aa562d7328824 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -34,13 +34,10 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -59,13 +56,10 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -150,9 +144,6 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_imin_sle_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -164,9 +155,6 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_imin_sle_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -226,9 +214,6 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
; CI-LABEL: s_test_imin_sle_v1i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -240,9 +225,6 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
; VI-LABEL: s_test_imin_sle_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -306,9 +288,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s3, s3, s7
; CI-NEXT: s_min_i32 s2, s2, s6
@@ -327,9 +306,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s3, s3, s7
; VI-NEXT: s_min_i32 s2, s2, s6
@@ -438,14 +414,11 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i8 s2, s2
; CI-NEXT: s_sext_i32_i8 s3, s3
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_byte v[0:1], v2
@@ -456,14 +429,11 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i8 s2, s2
; VI-NEXT: s_sext_i32_i8 s3, s3
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -579,8 +549,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s4, s2, 24
; CI-NEXT: s_sext_i32_i8 s5, s2
@@ -604,7 +572,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; CI-NEXT: s_and_b32 s3, s3, 0xffff
; CI-NEXT: s_or_b32 s2, s3, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -615,8 +582,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s4, s2, 24
; VI-NEXT: s_bfe_i32 s5, s2, 0x80010
@@ -640,7 +605,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_or_b32 s2, s2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -793,9 +757,6 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
; CI-LABEL: s_test_imin_sle_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s4, s2, 16
; CI-NEXT: s_sext_i32_i16 s2, s2
@@ -815,9 +776,6 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
; VI-LABEL: s_test_imin_sle_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s4, s2, 16
; VI-NEXT: s_sext_i32_i16 s2, s2
@@ -899,9 +857,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s6, s0, 16
; CI-NEXT: s_ashr_i32 s7, s1, 16
@@ -932,9 +887,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s6, s1, 16
; VI-NEXT: s_sext_i32_i16 s1, s1
@@ -1031,13 +983,10 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1056,13 +1005,10 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1176,13 +1122,10 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1201,13 +1144,10 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1293,9 +1233,6 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_imin_slt_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1307,9 +1244,6 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_imin_slt_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1371,9 +1305,6 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s1, s1, s3
; CI-NEXT: s_min_i32 s0, s0, s2
@@ -1388,9 +1319,6 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s1, s1, s3
; VI-NEXT: s_min_i32 s0, s0, s2
@@ -1463,9 +1391,6 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, 8
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1478,9 +1403,6 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1546,9 +1468,6 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, 8
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1561,9 +1480,6 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1641,13 +1557,10 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1666,13 +1579,10 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1776,15 +1686,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v6
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v2, s5
; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v6
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; CI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
@@ -1803,15 +1710,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v6
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; VI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
@@ -1934,15 +1838,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -1973,15 +1874,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -2078,9 +1976,6 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_umin_ule_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2092,9 +1987,6 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_umin_ule_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2167,13 +2059,10 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2192,13 +2081,10 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2302,9 +2188,6 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s3
; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
@@ -2326,9 +2209,6 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0
@@ -2414,9 +2294,6 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_umin_ult_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2428,9 +2305,6 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_umin_ult_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2512,9 +2386,6 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
; CI-LABEL: v_test_umin_ult_i32_multi_use:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-NEXT: s_load_dword s5, s[6:7], 0x0
@@ -2536,9 +2407,6 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
; VI-LABEL: v_test_umin_ult_i32_multi_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s4, s[4:5], 0x0
; VI-NEXT: s_load_dword s5, s[6:7], 0x0
@@ -2666,9 +2534,6 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; CI-LABEL: v_test_umin_ult_i16_multi_use:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
@@ -2691,9 +2556,6 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; VI-LABEL: v_test_umin_ult_i16_multi_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
@@ -2784,9 +2646,6 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
; CI-LABEL: s_test_umin_ult_v1i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2798,9 +2657,6 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
; VI-LABEL: s_test_umin_ult_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2870,9 +2726,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
;
; CI-LABEL: s_test_umin_ult_v8i32:
; CI: ; %bb.0:
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -2904,9 +2757,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
;
; VI-LABEL: s_test_umin_ult_v8i32:
; VI: ; %bb.0:
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -3071,9 +2921,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s10, s0, 16
; CI-NEXT: s_and_b32 s0, s0, 0xffff
@@ -3120,9 +2967,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s10, s3, 16
; VI-NEXT: s_and_b32 s3, s3, 0xffff
@@ -3244,14 +3088,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0xffff
; CI-NEXT: s_and_b32 s3, s3, 0xffff
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -3262,14 +3103,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -3357,14 +3195,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i16 s2, s2
; CI-NEXT: s_sext_i32_i16 s3, s3
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -3375,14 +3210,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: s_sext_i32_i16 s3, s3
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -3477,9 +3309,6 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i16 s3, s2
; CI-NEXT: s_ashr_i32 s2, s2, 16
@@ -3494,9 +3323,6 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s3, s2
; VI-NEXT: s_ashr_i32 s2, s2, 16
@@ -3577,9 +3403,6 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3598,9 +3421,6 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3690,9 +3510,6 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3711,9 +3528,6 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3803,9 +3617,6 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3824,9 +3635,6 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3916,9 +3724,6 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3937,9 +3742,6 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -4053,12 +3855,9 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_load_dword v4, v[0:1]
@@ -4087,13 +3886,10 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -4209,12 +4005,9 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_load_dword v4, v[0:1]
@@ -4242,13 +4035,10 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -4322,5 +4112,5 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
index b1ce5a3423f20..f31f577e3896b 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -180,9 +180,6 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -263,9 +260,6 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -347,9 +341,6 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -412,9 +403,6 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -477,9 +465,6 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -542,9 +527,6 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -606,9 +588,6 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT: s_add_i32 s12, s12, s17
-; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -653,5 +632,5 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index 5803821a1d2c0..8b95b26f142db 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -176,9 +176,6 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1)
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX803-NEXT: s_add_i32 s12, s12, s17
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -257,9 +254,6 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX803-NEXT: s_add_i32 s12, s12, s17
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -339,9 +333,6 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 {
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX803-NEXT: s_add_i32 s12, s12, s17
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -402,9 +393,6 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX803-NEXT: s_add_i32 s12, s12, s17
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -466,9 +454,6 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 {
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX803-NEXT: s_add_i32 s12, s12, s17
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -529,9 +514,6 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX803-NEXT: s_add_i32 s12, s12, s17
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -575,6 +557,6 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index a75c04e435487..c942426bcc720 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -26,16 +26,16 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
;
; PEI-GFX908-LABEL: name: partial_copy
; PEI-GFX908: bb.0 (%ir-block.0):
- ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
+ ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
; PEI-GFX908-NEXT: {{ $}}
- ; PEI-GFX908-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
- ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
- ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+ ; PEI-GFX908-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; PEI-GFX908-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; PEI-GFX908-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+ ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
@@ -44,7 +44,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+ ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -70,16 +70,16 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
;
; PEI-GFX90A-LABEL: name: partial_copy
; PEI-GFX90A: bb.0 (%ir-block.0):
- ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
+ ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
; PEI-GFX90A-NEXT: {{ $}}
- ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
- ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
- ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+ ; PEI-GFX90A-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; PEI-GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; PEI-GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+ ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
@@ -87,7 +87,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+ ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
@@ -104,4 +104,4 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
-attributes #0 = { nounwind "amdgpu-num-vgpr"="5" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { nounwind "amdgpu-num-vgpr"="5" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index c26f0926d86b2..71e37bd7ee312 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -19,16 +19,16 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preload_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB0_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB0_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -54,16 +54,17 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: preload_unused_arg_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB1_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB1_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s12
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -180,7 +181,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: incorrect_type_i64_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB5_0
; GFX90a-NEXT: .p2align 8
@@ -190,7 +191,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i64, ptr addrspace(4) %imp_arg_ptr
@@ -216,7 +217,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: incorrect_type_i16_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB6_0
; GFX90a-NEXT: .p2align 8
@@ -226,7 +227,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i16, ptr addrspace(4) %imp_arg_ptr
@@ -251,15 +252,16 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preload_block_count_y:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB7_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB7_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
@@ -287,7 +289,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: random_incorrect_offset:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB8_0
; GFX90a-NEXT: .p2align 8
@@ -298,7 +300,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
@@ -325,16 +327,17 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preload_block_count_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB9_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB9_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s12
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
@@ -363,18 +366,19 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa
;
; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB10_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB10_0:
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
-; GFX90a-NEXT: s_add_i32 s0, s12, s0
+; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-NEXT: s_add_i32 s0, s10, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -404,18 +408,19 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_block_count_xyz:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB11_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB11_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
@@ -449,17 +454,17 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_workgroup_size_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB12_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB12_0:
-; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
@@ -487,17 +492,17 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_workgroup_size_y:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB13_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB13_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
@@ -526,18 +531,18 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_workgroup_size_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
-; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB14_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB14_0:
-; GFX90a-NEXT: s_and_b32 s0, s14, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s12, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
@@ -570,22 +575,22 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
;
; GFX90a-LABEL: preload_workgroup_size_xyz:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
-; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB15_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB15_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT: s_and_b32 s1, s13, 0xffff
-; GFX90a-NEXT: s_and_b32 s2, s14, 0xffff
+; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
+; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff
+; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
@@ -623,18 +628,18 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-LABEL: preload_remainder_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
-; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB16_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB16_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s14, 16
+; GFX90a-NEXT: s_lshr_b32 s0, s12, 16
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
@@ -663,16 +668,18 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-LABEL: preloadremainder_y:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB17_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB17_0:
-; GFX90a-NEXT: s_and_b32 s0, s15, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
@@ -701,16 +708,18 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-LABEL: preloadremainder_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB18_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB18_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s15, 16
+; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
@@ -743,20 +752,22 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preloadremainder_xyz:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB19_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB19_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s15, 16
-; GFX90a-NEXT: s_lshr_b32 s1, s14, 16
-; GFX90a-NEXT: s_and_b32 s2, s15, 0xffff
+; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-NEXT: s_lshr_b32 s1, s12, 16
+; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
@@ -833,7 +844,10 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
;
; GFX90a-LABEL: preload_block_max_user_sgprs:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x20
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB21_0
; GFX90a-NEXT: .p2align 8
@@ -843,7 +857,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -873,23 +887,21 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt
;
; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
-; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB22_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB22_0:
-; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x1c
-; GFX90a-NEXT: s_and_b32 s1, s14, 0xffff
+; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_lshr_b32 s0, s0, 16
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 7ae0c11dca279..fe6378435a42e 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -21,17 +21,17 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0)
;
; GFX90a-LABEL: ptr1_i8:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB0_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB0_0:
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
+; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
@@ -56,17 +56,17 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero
;
; GFX90a-LABEL: ptr1_i8_zext_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB1_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB1_0:
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
+; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -91,17 +91,17 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16
;
; GFX90a-LABEL: ptr1_i16_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB2_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB2_0:
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -125,16 +125,16 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32
;
; GFX90a-LABEL: ptr1_i32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB3_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB3_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store i32 %arg0, ptr addrspace(1) %out
ret void
@@ -160,17 +160,18 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspa
;
; GFX90a-LABEL: i32_ptr1_i32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB4_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB4_0:
-; GFX90a-NEXT: s_add_i32 s0, s8, s12
+; GFX90a-NEXT: s_add_i32 s0, s6, s10
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
@@ -197,19 +198,19 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: ptr1_i16_i16_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB5_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB5_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s10, 16
-; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff
; GFX90a-NEXT: s_add_i32 s0, s1, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
@@ -235,16 +236,16 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2
;
; GFX90a-LABEL: ptr1_v2i8_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB6_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB6_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store <2 x i8> %in, ptr addrspace(1) %out
ret void
@@ -273,7 +274,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
;
; GFX90a-LABEL: byref_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB7_0
; GFX90a-NEXT: .p2align 8
@@ -284,9 +285,9 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
@@ -319,7 +320,7 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: byref_staggered_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB8_0
; GFX90a-NEXT: .p2align 8
@@ -330,9 +331,9 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
@@ -369,26 +370,26 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x
;
; GFX90a-LABEL: v8i32_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB9_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB9_0:
-; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v0, s16
-; GFX90a-NEXT: v_mov_b32_e32 v1, s17
-; GFX90a-NEXT: v_mov_b32_e32 v2, s18
-; GFX90a-NEXT: v_mov_b32_e32 v3, s19
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
-; GFX90a-NEXT: s_nop 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-NEXT: s_nop 0
+; GFX90a-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX90a-NEXT: s_endpgm
store <8 x i32> %in, ptr addrspace(1) %out, align 4
ret void
@@ -413,17 +414,18 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-LABEL: v3i16_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB10_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB10_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store <3 x i16> %in, ptr addrspace(1) %out, align 4
ret void
@@ -449,17 +451,19 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-LABEL: v3i32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB11_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB11_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
store <3 x i32> %in, ptr addrspace(1) %out, align 4
ret void
@@ -485,17 +489,19 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-LABEL: v3f32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB12_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB12_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
store <3 x float> %in, ptr addrspace(1) %out, align 4
ret void
@@ -527,24 +533,25 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou
;
; GFX90a-LABEL: v5i8_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB13_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB13_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s10, 24
+; GFX90a-NEXT: s_lshr_b32 s1, s8, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s0, s0, s1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_byte v0, v1, s[8:9] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: global_store_byte v0, v1, s[6:7] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store <5 x i8> %in, ptr addrspace(1) %out, align 4
ret void
@@ -580,29 +587,29 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x
;
; GFX90a-LABEL: v5f64_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB14_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB14_0:
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x40
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-NEXT: v_mov_b32_e32 v0, s16
-; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] offset:32
-; GFX90a-NEXT: v_mov_b32_e32 v1, s17
-; GFX90a-NEXT: v_mov_b32_e32 v2, s18
-; GFX90a-NEXT: v_mov_b32_e32 v3, s19
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
-; GFX90a-NEXT: s_nop 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-NEXT: s_nop 0
+; GFX90a-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX90a-NEXT: s_endpgm
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
@@ -640,30 +647,31 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8
;
; GFX90a-LABEL: v8i8_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB15_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB15_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s11, 24
+; GFX90a-NEXT: s_lshr_b32 s1, s9, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s11, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s2, s9, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_lshr_b32 s2, s10, 24
+; GFX90a-NEXT: s_lshr_b32 s2, s8, 24
; GFX90a-NEXT: s_lshl_b32 s2, s2, 8
-; GFX90a-NEXT: s_bfe_u32 s3, s10, 0x80010
-; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff
+; GFX90a-NEXT: s_bfe_u32 s3, s8, 0x80010
+; GFX90a-NEXT: s_and_b32 s0, s9, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s2, s3, s2
; GFX90a-NEXT: s_or_b32 s0, s0, s1
-; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff
+; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff
; GFX90a-NEXT: s_lshl_b32 s2, s2, 16
; GFX90a-NEXT: s_or_b32 s1, s1, s2
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-NEXT: s_endpgm
store <8 x i8> %in, ptr addrspace(1) %out
ret void
@@ -686,15 +694,16 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i
;
; GFX90a-LABEL: i64_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB16_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB16_0:
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-NEXT: s_endpgm
store i64 %a, ptr addrspace(1) %out, align 8
ret void
@@ -717,15 +726,16 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, d
;
; GFX90a-LABEL: f64_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB17_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB17_0:
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-NEXT: s_endpgm
store double %in, ptr addrspace(1) %out
ret void
@@ -748,16 +758,16 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: half_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB18_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB18_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
ret void
@@ -780,16 +790,16 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out
;
; GFX90a-LABEL: bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB19_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB19_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store bfloat %in, ptr addrspace(1) %out
ret void
@@ -812,16 +822,16 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: v2bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB20_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB20_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store <2 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -846,17 +856,18 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: v3bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB21_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB21_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store <3 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -882,17 +893,19 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: v6bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB22_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB22_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
store <6 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -921,24 +934,24 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr
;
; GFX90a-LABEL: half_v7bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB23_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB23_0:
-; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: global_store_short v3, v0, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-NEXT: global_store_short v3, v0, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s13
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v0, s3
-; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
-; GFX90a-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_short v3, v0, s[0:1] offset:12
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX90a-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
store <7 x bfloat> %in2, ptr addrspace(1) %out2
@@ -963,17 +976,17 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1
;
; GFX90a-LABEL: i1_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB24_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB24_0:
-; GFX90a-NEXT: s_and_b32 s0, s10, 1
+; GFX90a-NEXT: s_and_b32 s0, s8, 1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_byte v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_byte v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store i1 %in, ptr addrspace(1) %out
ret void
@@ -1000,18 +1013,20 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: fp128_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB25_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB25_0:
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v3, s13
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX90a-NEXT: s_endpgm
store fp128 %in, ptr addrspace(1) %out
ret void
@@ -1044,25 +1059,26 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: v7i8_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB26_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB26_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s10, 24
+; GFX90a-NEXT: s_lshr_b32 s1, s8, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s0, s0, s1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[8:9] offset:6
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[6:7] offset:6
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
store <7 x i8> %in, ptr addrspace(1) %out
ret void
@@ -1090,19 +1106,21 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out
;
; GFX90a-LABEL: v7half_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB27_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB27_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s15
-; GFX90a-NEXT: global_store_short v3, v0, s[8:9] offset:12
-; GFX90a-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s13
+; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
store <7 x half> %in, ptr addrspace(1) %out
ret void
@@ -1127,18 +1145,18 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %ou
;
; GFX90a-LABEL: i16_i32_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB28_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB28_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_dword v0, v1, s[12:13]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i32 %in2, ptr addrspace(1) %out2
@@ -1166,22 +1184,22 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %
;
; GFX90a-LABEL: i16_v3i32_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB29_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB29_0:
-; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
-; GFX90a-NEXT: v_mov_b32_e32 v4, s10
+; GFX90a-NEXT: v_mov_b32_e32 v4, s8
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: global_store_short v3, v4, s[6:7]
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NEXT: global_store_short v3, v4, s[8:9]
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <3 x i32> %in2, ptr addrspace(1) %out2
@@ -1206,17 +1224,17 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %ou
;
; GFX90a-LABEL: i16_i16_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB30_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB30_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
-; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[12:13]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i16 %in2, ptr addrspace(1) %out2
@@ -1246,22 +1264,22 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: i16_v2i8_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB31_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB31_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s10, 24
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 24
; GFX90a-NEXT: s_lshl_b32 s0, s0, 8
-; GFX90a-NEXT: s_bfe_u32 s1, s10, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s1, s8, 0x80010
; GFX90a-NEXT: s_or_b32 s0, s1, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_short v0, v1, s[12:13]
+; GFX90a-NEXT: global_store_short v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <2 x i8> %in2, ptr addrspace(1) %out2
@@ -1290,7 +1308,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
;
; GFX90a-LABEL: i32_ptr1_i32_staggered_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB32_0
; GFX90a-NEXT: .p2align 8
@@ -1300,7 +1318,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_add_i32 s2, s8, s2
+; GFX90a-NEXT: s_add_i32 s2, s6, s2
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
@@ -1327,16 +1345,17 @@ define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: ptr1_i8_trailing_unused:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB33_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB33_0:
-; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
+; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 8f25e6519588b..bc0b81749460f 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -1,14 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -earlycse-debug-hash -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
; GCN-LABEL: v_sad_u32_pat1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
@@ -30,18 +27,15 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b,
ret void
}
-define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a) {
+define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a) #0 {
; GCN-LABEL: v_sad_u32_constant_pat1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0x5a
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_sad_u32 v2, s2, v0, 20
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -58,14 +52,11 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a
ret void
}
-define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
; GCN-LABEL: v_sad_u32_pat2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
@@ -85,17 +76,15 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b,
ret void
}
-define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
; GCN-LABEL: v_sad_u32_multi_use_sub_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
-; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
+; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
+; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_add_u32 s20, s20, s17
-; GCN-NEXT: s_addc_u32 s21, s21, 0
+; GCN-NEXT: s_add_u32 s16, s16, s15
+; GCN-NEXT: s_addc_u32 s17, s17, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_min_u32 s3, s0, s1
; GCN-NEXT: s_max_u32 s0, s0, s1
@@ -103,9 +92,8 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_add_i32 s0, s0, s2
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
@@ -124,25 +112,22 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
ret void
}
-define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
; GCN-LABEL: v_sad_u32_multi_use_add_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
-; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
+; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
+; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_add_u32 s16, s16, s15
+; GCN-NEXT: s_addc_u32 s17, s17, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_mov_b32_e32 v3, s2
; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_sad_u32 v2, s0, v2, v3
-; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -159,27 +144,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i
ret void
}
-define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
; GCN-LABEL: v_sad_u32_multi_use_max_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
-; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
+; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
+; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_add_u32 s20, s20, s17
-; GCN-NEXT: s_addc_u32 s21, s21, 0
+; GCN-NEXT: s_add_u32 s16, s16, s15
+; GCN-NEXT: s_addc_u32 s17, s17, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_max_u32 s3, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
-; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_store_dword v[0:1], v3
; GCN-NEXT: s_endpgm
@@ -197,27 +179,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
ret void
}
-define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
; GCN-LABEL: v_sad_u32_multi_use_min_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
-; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
+; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
+; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_add_u32 s20, s20, s17
-; GCN-NEXT: s_addc_u32 s21, s21, 0
+; GCN-NEXT: s_add_u32 s16, s16, s15
+; GCN-NEXT: s_addc_u32 s17, s17, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_min_u32 s3, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
-; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_store_dword v[0:1], v3
; GCN-NEXT: s_endpgm
@@ -236,27 +215,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
ret void
}
-define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
; GCN-LABEL: v_sad_u32_multi_use_sub_pat2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
-; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
+; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
+; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_add_u32 s20, s20, s17
-; GCN-NEXT: s_addc_u32 s21, s21, 0
+; GCN-NEXT: s_add_u32 s16, s16, s15
+; GCN-NEXT: s_addc_u32 s17, s17, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s3, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
-; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_store_dword v[0:1], v3
; GCN-NEXT: s_endpgm
@@ -272,17 +248,15 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
ret void
}
-define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
; GCN-LABEL: v_sad_u32_multi_use_select_pat2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
-; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
+; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
+; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_add_u32 s20, s20, s17
-; GCN-NEXT: s_addc_u32 s21, s21, 0
+; GCN-NEXT: s_add_u32 s16, s16, s15
+; GCN-NEXT: s_addc_u32 s17, s17, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_min_u32 s3, s0, s1
; GCN-NEXT: s_max_u32 s0, s0, s1
@@ -290,9 +264,8 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_add_i32 s0, s0, s2
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
@@ -309,12 +282,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
; GCN-LABEL: v_sad_u32_vector_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc
; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
@@ -348,12 +318,9 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32
ret void
}
-define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
; GCN-LABEL: v_sad_u32_vector_pat2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc
; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
@@ -385,14 +352,12 @@ define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32
ret void
}
-define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) {
+define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) #0 {
; GCN-LABEL: v_sad_u32_i16_pat1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-NEXT: s_lshr_b32 s0, s0, 16
@@ -400,7 +365,6 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_sad_u32 v2, s4, v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -417,12 +381,9 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
ret void
}
-define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
+define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) #0 {
; GCN-LABEL: v_sad_u32_i16_pat2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -450,14 +411,11 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) {
+define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) #0 {
; GCN-LABEL: v_sad_u32_i8_pat1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s3, s2, 0xff
; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008
@@ -482,12 +440,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
ret void
}
-define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
+define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) #0 {
; GCN-LABEL: v_sad_u32_i8_pat2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -515,14 +470,11 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
+define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) #0 {
; GCN-LABEL: s_sad_u32_i8_pat2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s3, s2, 0xff
; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008
@@ -545,14 +497,11 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %
ret void
}
-define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) #0 {
; GCN-LABEL: v_sad_u32_mismatched_operands_pat1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_max_u32 s6, s0, s1
; GCN-NEXT: s_cmp_le_u32 s0, s1
@@ -577,14 +526,11 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %
ret void
}
-define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) #0 {
; GCN-LABEL: v_sad_u32_mismatched_operands_pat2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s3, s0, s3
; GCN-NEXT: s_sub_i32 s6, s1, s0
@@ -607,3 +553,4 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %
ret void
}
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index 29448ab2d822e..19a41a89b6ac6 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -9,8 +9,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v5, s3
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -26,8 +24,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v5, s3
; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -43,8 +39,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, s3
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -61,8 +55,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
-; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -96,8 +88,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v5, s3
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -113,8 +103,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v5, s3
; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -130,8 +118,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, s3
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -148,8 +134,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
-; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -180,5 +164,5 @@ entry:
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index 90dfd5a21d107..f072f68c67ab3 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -20,183 +20,179 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: ; def s[2:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
-; CHECK-NEXT: v_writelane_b32 v22, s2, 0
-; CHECK-NEXT: v_writelane_b32 v22, s3, 1
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[48:51]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[4:11]
+; CHECK-NEXT: ; def s[4:7]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_writelane_b32 v22, s2, 0
+; CHECK-NEXT: v_writelane_b32 v22, s3, 1
; CHECK-NEXT: v_writelane_b32 v22, s4, 2
; CHECK-NEXT: v_writelane_b32 v22, s5, 3
; CHECK-NEXT: v_writelane_b32 v22, s6, 4
+; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
; CHECK-NEXT: v_writelane_b32 v22, s7, 5
-; CHECK-NEXT: v_writelane_b32 v22, s8, 6
-; CHECK-NEXT: v_writelane_b32 v22, s9, 7
-; CHECK-NEXT: v_writelane_b32 v22, s10, 8
-; CHECK-NEXT: v_writelane_b32 v22, s11, 9
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[4:11]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_writelane_b32 v22, s4, 6
+; CHECK-NEXT: v_writelane_b32 v22, s5, 7
+; CHECK-NEXT: v_writelane_b32 v22, s6, 8
+; CHECK-NEXT: v_writelane_b32 v22, s7, 9
+; CHECK-NEXT: v_writelane_b32 v22, s8, 10
+; CHECK-NEXT: v_writelane_b32 v22, s9, 11
+; CHECK-NEXT: v_writelane_b32 v22, s10, 12
+; CHECK-NEXT: v_writelane_b32 v22, s11, 13
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[4:19]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s4, 10
-; CHECK-NEXT: v_writelane_b32 v22, s5, 11
-; CHECK-NEXT: v_writelane_b32 v22, s6, 12
-; CHECK-NEXT: v_writelane_b32 v22, s7, 13
-; CHECK-NEXT: v_writelane_b32 v22, s8, 14
-; CHECK-NEXT: v_writelane_b32 v22, s9, 15
-; CHECK-NEXT: v_writelane_b32 v22, s10, 16
-; CHECK-NEXT: v_writelane_b32 v22, s11, 17
-; CHECK-NEXT: v_writelane_b32 v22, s12, 18
-; CHECK-NEXT: v_writelane_b32 v22, s13, 19
-; CHECK-NEXT: v_writelane_b32 v22, s14, 20
-; CHECK-NEXT: v_writelane_b32 v22, s15, 21
-; CHECK-NEXT: v_writelane_b32 v22, s16, 22
-; CHECK-NEXT: v_writelane_b32 v22, s17, 23
-; CHECK-NEXT: v_writelane_b32 v22, s18, 24
-; CHECK-NEXT: v_writelane_b32 v22, s19, 25
+; CHECK-NEXT: v_writelane_b32 v22, s4, 14
+; CHECK-NEXT: v_writelane_b32 v22, s5, 15
+; CHECK-NEXT: v_writelane_b32 v22, s6, 16
+; CHECK-NEXT: v_writelane_b32 v22, s7, 17
+; CHECK-NEXT: v_writelane_b32 v22, s8, 18
+; CHECK-NEXT: v_writelane_b32 v22, s9, 19
+; CHECK-NEXT: v_writelane_b32 v22, s10, 20
+; CHECK-NEXT: v_writelane_b32 v22, s11, 21
+; CHECK-NEXT: v_writelane_b32 v22, s12, 22
+; CHECK-NEXT: v_writelane_b32 v22, s13, 23
+; CHECK-NEXT: v_writelane_b32 v22, s14, 24
+; CHECK-NEXT: v_writelane_b32 v22, s15, 25
+; CHECK-NEXT: v_writelane_b32 v22, s16, 26
+; CHECK-NEXT: v_writelane_b32 v22, s17, 27
+; CHECK-NEXT: v_writelane_b32 v22, s18, 28
+; CHECK-NEXT: v_writelane_b32 v22, s19, 29
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[38:39]
+; CHECK-NEXT: ; def s[42:43]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[44:47]
+; CHECK-NEXT: ; def s[52:55]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[4:11]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s4, 26
-; CHECK-NEXT: v_writelane_b32 v22, s5, 27
-; CHECK-NEXT: v_writelane_b32 v22, s6, 28
-; CHECK-NEXT: v_writelane_b32 v22, s7, 29
-; CHECK-NEXT: v_writelane_b32 v22, s8, 30
-; CHECK-NEXT: v_writelane_b32 v22, s9, 31
-; CHECK-NEXT: v_writelane_b32 v22, s10, 32
-; CHECK-NEXT: v_writelane_b32 v22, s11, 33
+; CHECK-NEXT: v_writelane_b32 v22, s4, 30
+; CHECK-NEXT: v_writelane_b32 v22, s5, 31
+; CHECK-NEXT: v_writelane_b32 v22, s6, 32
+; CHECK-NEXT: v_writelane_b32 v22, s7, 33
+; CHECK-NEXT: v_writelane_b32 v22, s8, 34
+; CHECK-NEXT: v_writelane_b32 v22, s9, 35
+; CHECK-NEXT: v_writelane_b32 v22, s10, 36
+; CHECK-NEXT: v_writelane_b32 v22, s11, 37
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[16:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[36:37]
+; CHECK-NEXT: ; def s[40:41]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[40:43]
+; CHECK-NEXT: ; def s[36:39]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[0:7]
+; CHECK-NEXT: ; def s[44:51]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s0, 34
-; CHECK-NEXT: v_writelane_b32 v22, s1, 35
-; CHECK-NEXT: v_writelane_b32 v22, s2, 36
-; CHECK-NEXT: v_writelane_b32 v22, s3, 37
-; CHECK-NEXT: v_writelane_b32 v22, s4, 38
-; CHECK-NEXT: v_writelane_b32 v22, s5, 39
-; CHECK-NEXT: v_writelane_b32 v22, s6, 40
-; CHECK-NEXT: v_writelane_b32 v22, s7, 41
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s0, 42
-; CHECK-NEXT: v_writelane_b32 v22, s1, 43
-; CHECK-NEXT: v_writelane_b32 v22, s2, 44
-; CHECK-NEXT: v_writelane_b32 v22, s3, 45
-; CHECK-NEXT: v_writelane_b32 v22, s4, 46
-; CHECK-NEXT: v_writelane_b32 v22, s5, 47
-; CHECK-NEXT: v_writelane_b32 v22, s6, 48
-; CHECK-NEXT: v_writelane_b32 v22, s7, 49
-; CHECK-NEXT: v_writelane_b32 v22, s8, 50
-; CHECK-NEXT: v_writelane_b32 v22, s9, 51
-; CHECK-NEXT: v_writelane_b32 v22, s10, 52
-; CHECK-NEXT: v_writelane_b32 v22, s11, 53
-; CHECK-NEXT: v_writelane_b32 v22, s12, 54
-; CHECK-NEXT: v_writelane_b32 v22, s13, 55
-; CHECK-NEXT: v_writelane_b32 v22, s14, 56
-; CHECK-NEXT: v_writelane_b32 v22, s15, 57
+; CHECK-NEXT: v_writelane_b32 v22, s0, 38
+; CHECK-NEXT: v_writelane_b32 v22, s1, 39
+; CHECK-NEXT: v_writelane_b32 v22, s2, 40
+; CHECK-NEXT: v_writelane_b32 v22, s3, 41
+; CHECK-NEXT: v_writelane_b32 v22, s4, 42
+; CHECK-NEXT: v_writelane_b32 v22, s5, 43
+; CHECK-NEXT: v_writelane_b32 v22, s6, 44
+; CHECK-NEXT: v_writelane_b32 v22, s7, 45
+; CHECK-NEXT: v_writelane_b32 v22, s8, 46
+; CHECK-NEXT: v_writelane_b32 v22, s9, 47
+; CHECK-NEXT: v_writelane_b32 v22, s10, 48
+; CHECK-NEXT: v_writelane_b32 v22, s11, 49
+; CHECK-NEXT: v_writelane_b32 v22, s12, 50
+; CHECK-NEXT: v_writelane_b32 v22, s13, 51
+; CHECK-NEXT: v_writelane_b32 v22, s14, 52
+; CHECK-NEXT: v_writelane_b32 v22, s15, 53
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[34:35]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s0, 58
-; CHECK-NEXT: v_writelane_b32 v22, s1, 59
-; CHECK-NEXT: v_writelane_b32 v22, s2, 60
-; CHECK-NEXT: v_writelane_b32 v22, s3, 61
+; CHECK-NEXT: v_writelane_b32 v22, s0, 54
+; CHECK-NEXT: v_writelane_b32 v22, s1, 55
+; CHECK-NEXT: v_writelane_b32 v22, s2, 56
+; CHECK-NEXT: v_writelane_b32 v22, s3, 57
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:7]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_writelane_b32 v22, s0, 58
+; CHECK-NEXT: v_writelane_b32 v22, s1, 59
+; CHECK-NEXT: v_writelane_b32 v22, s2, 60
; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
-; CHECK-NEXT: v_writelane_b32 v22, s0, 62
-; CHECK-NEXT: v_writelane_b32 v23, s2, 0
-; CHECK-NEXT: v_writelane_b32 v23, s3, 1
-; CHECK-NEXT: v_writelane_b32 v23, s4, 2
-; CHECK-NEXT: v_writelane_b32 v23, s5, 3
-; CHECK-NEXT: v_writelane_b32 v23, s6, 4
-; CHECK-NEXT: v_writelane_b32 v22, s1, 63
-; CHECK-NEXT: v_writelane_b32 v23, s7, 5
+; CHECK-NEXT: v_writelane_b32 v22, s3, 61
+; CHECK-NEXT: v_writelane_b32 v22, s4, 62
+; CHECK-NEXT: v_writelane_b32 v23, s6, 0
+; CHECK-NEXT: v_writelane_b32 v22, s5, 63
+; CHECK-NEXT: v_writelane_b32 v23, s7, 1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 6
-; CHECK-NEXT: v_writelane_b32 v23, s1, 7
-; CHECK-NEXT: v_writelane_b32 v23, s2, 8
-; CHECK-NEXT: v_writelane_b32 v23, s3, 9
-; CHECK-NEXT: v_writelane_b32 v23, s4, 10
-; CHECK-NEXT: v_writelane_b32 v23, s5, 11
-; CHECK-NEXT: v_writelane_b32 v23, s6, 12
-; CHECK-NEXT: v_writelane_b32 v23, s7, 13
-; CHECK-NEXT: v_writelane_b32 v23, s8, 14
-; CHECK-NEXT: v_writelane_b32 v23, s9, 15
-; CHECK-NEXT: v_writelane_b32 v23, s10, 16
-; CHECK-NEXT: v_writelane_b32 v23, s11, 17
-; CHECK-NEXT: v_writelane_b32 v23, s12, 18
-; CHECK-NEXT: v_writelane_b32 v23, s13, 19
-; CHECK-NEXT: v_writelane_b32 v23, s14, 20
-; CHECK-NEXT: v_writelane_b32 v23, s15, 21
+; CHECK-NEXT: v_writelane_b32 v23, s0, 2
+; CHECK-NEXT: v_writelane_b32 v23, s1, 3
+; CHECK-NEXT: v_writelane_b32 v23, s2, 4
+; CHECK-NEXT: v_writelane_b32 v23, s3, 5
+; CHECK-NEXT: v_writelane_b32 v23, s4, 6
+; CHECK-NEXT: v_writelane_b32 v23, s5, 7
+; CHECK-NEXT: v_writelane_b32 v23, s6, 8
+; CHECK-NEXT: v_writelane_b32 v23, s7, 9
+; CHECK-NEXT: v_writelane_b32 v23, s8, 10
+; CHECK-NEXT: v_writelane_b32 v23, s9, 11
+; CHECK-NEXT: v_writelane_b32 v23, s10, 12
+; CHECK-NEXT: v_writelane_b32 v23, s11, 13
+; CHECK-NEXT: v_writelane_b32 v23, s12, 14
+; CHECK-NEXT: v_writelane_b32 v23, s13, 15
+; CHECK-NEXT: v_writelane_b32 v23, s14, 16
+; CHECK-NEXT: v_writelane_b32 v23, s15, 17
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 22
-; CHECK-NEXT: v_writelane_b32 v23, s1, 23
+; CHECK-NEXT: v_writelane_b32 v23, s0, 18
+; CHECK-NEXT: v_writelane_b32 v23, s1, 19
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_writelane_b32 v23, s0, 20
+; CHECK-NEXT: v_writelane_b32 v23, s1, 21
+; CHECK-NEXT: v_writelane_b32 v23, s2, 22
+; CHECK-NEXT: v_writelane_b32 v23, s3, 23
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[0:7]
+; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_writelane_b32 v23, s0, 24
; CHECK-NEXT: v_writelane_b32 v23, s1, 25
; CHECK-NEXT: v_writelane_b32 v23, s2, 26
; CHECK-NEXT: v_writelane_b32 v23, s3, 27
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[0:7]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 28
-; CHECK-NEXT: v_writelane_b32 v23, s1, 29
-; CHECK-NEXT: v_writelane_b32 v23, s2, 30
-; CHECK-NEXT: v_writelane_b32 v23, s3, 31
-; CHECK-NEXT: v_writelane_b32 v23, s4, 32
-; CHECK-NEXT: v_writelane_b32 v23, s5, 33
-; CHECK-NEXT: v_writelane_b32 v23, s6, 34
-; CHECK-NEXT: v_writelane_b32 v23, s7, 35
+; CHECK-NEXT: v_writelane_b32 v23, s4, 28
+; CHECK-NEXT: v_writelane_b32 v23, s5, 29
+; CHECK-NEXT: v_writelane_b32 v23, s6, 30
+; CHECK-NEXT: v_writelane_b32 v23, s7, 31
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 36
-; CHECK-NEXT: v_writelane_b32 v23, s1, 37
-; CHECK-NEXT: v_writelane_b32 v23, s2, 38
-; CHECK-NEXT: v_writelane_b32 v23, s3, 39
-; CHECK-NEXT: v_writelane_b32 v23, s4, 40
-; CHECK-NEXT: v_writelane_b32 v23, s5, 41
-; CHECK-NEXT: v_writelane_b32 v23, s6, 42
-; CHECK-NEXT: v_writelane_b32 v23, s7, 43
-; CHECK-NEXT: v_writelane_b32 v23, s8, 44
-; CHECK-NEXT: v_writelane_b32 v23, s9, 45
-; CHECK-NEXT: v_writelane_b32 v23, s10, 46
-; CHECK-NEXT: v_writelane_b32 v23, s11, 47
-; CHECK-NEXT: v_writelane_b32 v23, s12, 48
-; CHECK-NEXT: v_writelane_b32 v23, s13, 49
-; CHECK-NEXT: v_writelane_b32 v23, s14, 50
-; CHECK-NEXT: v_writelane_b32 v23, s15, 51
+; CHECK-NEXT: v_writelane_b32 v23, s0, 32
+; CHECK-NEXT: v_writelane_b32 v23, s1, 33
+; CHECK-NEXT: v_writelane_b32 v23, s2, 34
+; CHECK-NEXT: v_writelane_b32 v23, s3, 35
+; CHECK-NEXT: v_writelane_b32 v23, s4, 36
+; CHECK-NEXT: v_writelane_b32 v23, s5, 37
+; CHECK-NEXT: v_writelane_b32 v23, s6, 38
+; CHECK-NEXT: v_writelane_b32 v23, s7, 39
+; CHECK-NEXT: v_writelane_b32 v23, s8, 40
+; CHECK-NEXT: v_writelane_b32 v23, s9, 41
+; CHECK-NEXT: v_writelane_b32 v23, s10, 42
+; CHECK-NEXT: v_writelane_b32 v23, s11, 43
+; CHECK-NEXT: v_writelane_b32 v23, s12, 44
+; CHECK-NEXT: v_writelane_b32 v23, s13, 45
+; CHECK-NEXT: v_writelane_b32 v23, s14, 46
+; CHECK-NEXT: v_writelane_b32 v23, s15, 47
; CHECK-NEXT: s_cbranch_scc0 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %ret
; CHECK-NEXT: s_endpgm
@@ -210,170 +206,166 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: v_readlane_b32 s1, v22, 3
; CHECK-NEXT: v_readlane_b32 s2, v22, 4
; CHECK-NEXT: v_readlane_b32 s3, v22, 5
-; CHECK-NEXT: v_readlane_b32 s4, v22, 6
-; CHECK-NEXT: v_readlane_b32 s5, v22, 7
-; CHECK-NEXT: v_readlane_b32 s6, v22, 8
-; CHECK-NEXT: v_readlane_b32 s7, v22, 9
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[48:51]
+; CHECK-NEXT: ; use s[0:3]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s0, v22, 6
+; CHECK-NEXT: v_readlane_b32 s1, v22, 7
+; CHECK-NEXT: v_readlane_b32 s2, v22, 8
+; CHECK-NEXT: v_readlane_b32 s3, v22, 9
+; CHECK-NEXT: v_readlane_b32 s4, v22, 10
+; CHECK-NEXT: v_readlane_b32 s5, v22, 11
+; CHECK-NEXT: v_readlane_b32 s6, v22, 12
+; CHECK-NEXT: v_readlane_b32 s7, v22, 13
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 10
-; CHECK-NEXT: v_readlane_b32 s1, v22, 11
-; CHECK-NEXT: v_readlane_b32 s2, v22, 12
-; CHECK-NEXT: v_readlane_b32 s3, v22, 13
-; CHECK-NEXT: v_readlane_b32 s4, v22, 14
-; CHECK-NEXT: v_readlane_b32 s5, v22, 15
-; CHECK-NEXT: v_readlane_b32 s6, v22, 16
-; CHECK-NEXT: v_readlane_b32 s7, v22, 17
-; CHECK-NEXT: v_readlane_b32 s8, v22, 18
-; CHECK-NEXT: v_readlane_b32 s9, v22, 19
-; CHECK-NEXT: v_readlane_b32 s10, v22, 20
-; CHECK-NEXT: v_readlane_b32 s11, v22, 21
-; CHECK-NEXT: v_readlane_b32 s12, v22, 22
-; CHECK-NEXT: v_readlane_b32 s13, v22, 23
-; CHECK-NEXT: v_readlane_b32 s14, v22, 24
-; CHECK-NEXT: v_readlane_b32 s15, v22, 25
+; CHECK-NEXT: v_readlane_b32 s0, v22, 14
+; CHECK-NEXT: v_readlane_b32 s1, v22, 15
+; CHECK-NEXT: v_readlane_b32 s2, v22, 16
+; CHECK-NEXT: v_readlane_b32 s3, v22, 17
+; CHECK-NEXT: v_readlane_b32 s4, v22, 18
+; CHECK-NEXT: v_readlane_b32 s5, v22, 19
+; CHECK-NEXT: v_readlane_b32 s6, v22, 20
+; CHECK-NEXT: v_readlane_b32 s7, v22, 21
+; CHECK-NEXT: v_readlane_b32 s8, v22, 22
+; CHECK-NEXT: v_readlane_b32 s9, v22, 23
+; CHECK-NEXT: v_readlane_b32 s10, v22, 24
+; CHECK-NEXT: v_readlane_b32 s11, v22, 25
+; CHECK-NEXT: v_readlane_b32 s12, v22, 26
+; CHECK-NEXT: v_readlane_b32 s13, v22, 27
+; CHECK-NEXT: v_readlane_b32 s14, v22, 28
+; CHECK-NEXT: v_readlane_b32 s15, v22, 29
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 26
-; CHECK-NEXT: v_readlane_b32 s1, v22, 27
-; CHECK-NEXT: v_readlane_b32 s2, v22, 28
-; CHECK-NEXT: v_readlane_b32 s3, v22, 29
-; CHECK-NEXT: v_readlane_b32 s4, v22, 30
-; CHECK-NEXT: v_readlane_b32 s5, v22, 31
-; CHECK-NEXT: v_readlane_b32 s6, v22, 32
-; CHECK-NEXT: v_readlane_b32 s7, v22, 33
+; CHECK-NEXT: v_readlane_b32 s0, v22, 30
+; CHECK-NEXT: v_readlane_b32 s1, v22, 31
+; CHECK-NEXT: v_readlane_b32 s2, v22, 32
+; CHECK-NEXT: v_readlane_b32 s3, v22, 33
+; CHECK-NEXT: v_readlane_b32 s4, v22, 34
+; CHECK-NEXT: v_readlane_b32 s5, v22, 35
+; CHECK-NEXT: v_readlane_b32 s6, v22, 36
+; CHECK-NEXT: v_readlane_b32 s7, v22, 37
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[38:39]
+; CHECK-NEXT: ; use s[42:43]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[44:47]
+; CHECK-NEXT: ; use s[52:55]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 34
-; CHECK-NEXT: v_readlane_b32 s1, v22, 35
-; CHECK-NEXT: v_readlane_b32 s2, v22, 36
-; CHECK-NEXT: v_readlane_b32 s3, v22, 37
-; CHECK-NEXT: v_readlane_b32 s4, v22, 38
-; CHECK-NEXT: v_readlane_b32 s5, v22, 39
-; CHECK-NEXT: v_readlane_b32 s6, v22, 40
-; CHECK-NEXT: v_readlane_b32 s7, v22, 41
+; CHECK-NEXT: v_readlane_b32 s0, v22, 38
+; CHECK-NEXT: v_readlane_b32 s1, v22, 39
+; CHECK-NEXT: v_readlane_b32 s2, v22, 40
+; CHECK-NEXT: v_readlane_b32 s3, v22, 41
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[16:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[36:37]
+; CHECK-NEXT: ; use s[40:41]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[40:43]
+; CHECK-NEXT: ; use s[36:39]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[0:7]
+; CHECK-NEXT: ; use s[44:51]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 42
-; CHECK-NEXT: v_readlane_b32 s1, v22, 43
-; CHECK-NEXT: v_readlane_b32 s2, v22, 44
-; CHECK-NEXT: v_readlane_b32 s3, v22, 45
-; CHECK-NEXT: v_readlane_b32 s4, v22, 46
-; CHECK-NEXT: v_readlane_b32 s5, v22, 47
-; CHECK-NEXT: v_readlane_b32 s6, v22, 48
-; CHECK-NEXT: v_readlane_b32 s7, v22, 49
-; CHECK-NEXT: v_readlane_b32 s8, v22, 50
-; CHECK-NEXT: v_readlane_b32 s9, v22, 51
-; CHECK-NEXT: v_readlane_b32 s10, v22, 52
-; CHECK-NEXT: v_readlane_b32 s11, v22, 53
-; CHECK-NEXT: v_readlane_b32 s12, v22, 54
-; CHECK-NEXT: v_readlane_b32 s13, v22, 55
-; CHECK-NEXT: v_readlane_b32 s14, v22, 56
-; CHECK-NEXT: v_readlane_b32 s15, v22, 57
+; CHECK-NEXT: v_readlane_b32 s4, v22, 42
+; CHECK-NEXT: v_readlane_b32 s5, v22, 43
+; CHECK-NEXT: v_readlane_b32 s6, v22, 44
+; CHECK-NEXT: v_readlane_b32 s7, v22, 45
+; CHECK-NEXT: v_readlane_b32 s8, v22, 46
+; CHECK-NEXT: v_readlane_b32 s9, v22, 47
+; CHECK-NEXT: v_readlane_b32 s10, v22, 48
+; CHECK-NEXT: v_readlane_b32 s11, v22, 49
+; CHECK-NEXT: v_readlane_b32 s12, v22, 50
+; CHECK-NEXT: v_readlane_b32 s13, v22, 51
+; CHECK-NEXT: v_readlane_b32 s14, v22, 52
+; CHECK-NEXT: v_readlane_b32 s15, v22, 53
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 58
-; CHECK-NEXT: v_readlane_b32 s1, v22, 59
-; CHECK-NEXT: v_readlane_b32 s2, v22, 60
-; CHECK-NEXT: v_readlane_b32 s3, v22, 61
+; CHECK-NEXT: v_readlane_b32 s0, v22, 54
+; CHECK-NEXT: v_readlane_b32 s1, v22, 55
+; CHECK-NEXT: v_readlane_b32 s2, v22, 56
+; CHECK-NEXT: v_readlane_b32 s3, v22, 57
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[34:35]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 62
-; CHECK-NEXT: v_readlane_b32 s1, v22, 63
-; CHECK-NEXT: v_readlane_b32 s2, v23, 0
-; CHECK-NEXT: v_readlane_b32 s3, v23, 1
-; CHECK-NEXT: v_readlane_b32 s4, v23, 2
-; CHECK-NEXT: v_readlane_b32 s5, v23, 3
-; CHECK-NEXT: v_readlane_b32 s6, v23, 4
-; CHECK-NEXT: v_readlane_b32 s7, v23, 5
+; CHECK-NEXT: v_readlane_b32 s0, v22, 58
+; CHECK-NEXT: v_readlane_b32 s1, v22, 59
+; CHECK-NEXT: v_readlane_b32 s2, v22, 60
+; CHECK-NEXT: v_readlane_b32 s3, v22, 61
+; CHECK-NEXT: v_readlane_b32 s4, v22, 62
+; CHECK-NEXT: v_readlane_b32 s5, v22, 63
+; CHECK-NEXT: v_readlane_b32 s6, v23, 0
+; CHECK-NEXT: v_readlane_b32 s7, v23, 1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 6
-; CHECK-NEXT: v_readlane_b32 s1, v23, 7
-; CHECK-NEXT: v_readlane_b32 s2, v23, 8
-; CHECK-NEXT: v_readlane_b32 s3, v23, 9
-; CHECK-NEXT: v_readlane_b32 s4, v23, 10
-; CHECK-NEXT: v_readlane_b32 s5, v23, 11
-; CHECK-NEXT: v_readlane_b32 s6, v23, 12
-; CHECK-NEXT: v_readlane_b32 s7, v23, 13
-; CHECK-NEXT: v_readlane_b32 s8, v23, 14
-; CHECK-NEXT: v_readlane_b32 s9, v23, 15
-; CHECK-NEXT: v_readlane_b32 s10, v23, 16
-; CHECK-NEXT: v_readlane_b32 s11, v23, 17
-; CHECK-NEXT: v_readlane_b32 s12, v23, 18
-; CHECK-NEXT: v_readlane_b32 s13, v23, 19
-; CHECK-NEXT: v_readlane_b32 s14, v23, 20
-; CHECK-NEXT: v_readlane_b32 s15, v23, 21
+; CHECK-NEXT: v_readlane_b32 s0, v23, 2
+; CHECK-NEXT: v_readlane_b32 s1, v23, 3
+; CHECK-NEXT: v_readlane_b32 s2, v23, 4
+; CHECK-NEXT: v_readlane_b32 s3, v23, 5
+; CHECK-NEXT: v_readlane_b32 s4, v23, 6
+; CHECK-NEXT: v_readlane_b32 s5, v23, 7
+; CHECK-NEXT: v_readlane_b32 s6, v23, 8
+; CHECK-NEXT: v_readlane_b32 s7, v23, 9
+; CHECK-NEXT: v_readlane_b32 s8, v23, 10
+; CHECK-NEXT: v_readlane_b32 s9, v23, 11
+; CHECK-NEXT: v_readlane_b32 s10, v23, 12
+; CHECK-NEXT: v_readlane_b32 s11, v23, 13
+; CHECK-NEXT: v_readlane_b32 s12, v23, 14
+; CHECK-NEXT: v_readlane_b32 s13, v23, 15
+; CHECK-NEXT: v_readlane_b32 s14, v23, 16
+; CHECK-NEXT: v_readlane_b32 s15, v23, 17
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 22
-; CHECK-NEXT: v_readlane_b32 s1, v23, 23
+; CHECK-NEXT: v_readlane_b32 s0, v23, 18
+; CHECK-NEXT: v_readlane_b32 s1, v23, 19
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s0, v23, 20
+; CHECK-NEXT: v_readlane_b32 s1, v23, 21
+; CHECK-NEXT: v_readlane_b32 s2, v23, 22
+; CHECK-NEXT: v_readlane_b32 s3, v23, 23
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:3]
+; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_readlane_b32 s0, v23, 24
; CHECK-NEXT: v_readlane_b32 s1, v23, 25
; CHECK-NEXT: v_readlane_b32 s2, v23, 26
; CHECK-NEXT: v_readlane_b32 s3, v23, 27
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[0:3]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 28
-; CHECK-NEXT: v_readlane_b32 s1, v23, 29
-; CHECK-NEXT: v_readlane_b32 s2, v23, 30
-; CHECK-NEXT: v_readlane_b32 s3, v23, 31
-; CHECK-NEXT: v_readlane_b32 s4, v23, 32
-; CHECK-NEXT: v_readlane_b32 s5, v23, 33
-; CHECK-NEXT: v_readlane_b32 s6, v23, 34
-; CHECK-NEXT: v_readlane_b32 s7, v23, 35
+; CHECK-NEXT: v_readlane_b32 s4, v23, 28
+; CHECK-NEXT: v_readlane_b32 s5, v23, 29
+; CHECK-NEXT: v_readlane_b32 s6, v23, 30
+; CHECK-NEXT: v_readlane_b32 s7, v23, 31
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 36
-; CHECK-NEXT: v_readlane_b32 s1, v23, 37
-; CHECK-NEXT: v_readlane_b32 s2, v23, 38
-; CHECK-NEXT: v_readlane_b32 s3, v23, 39
-; CHECK-NEXT: v_readlane_b32 s4, v23, 40
-; CHECK-NEXT: v_readlane_b32 s5, v23, 41
-; CHECK-NEXT: v_readlane_b32 s6, v23, 42
-; CHECK-NEXT: v_readlane_b32 s7, v23, 43
-; CHECK-NEXT: v_readlane_b32 s8, v23, 44
-; CHECK-NEXT: v_readlane_b32 s9, v23, 45
-; CHECK-NEXT: v_readlane_b32 s10, v23, 46
-; CHECK-NEXT: v_readlane_b32 s11, v23, 47
-; CHECK-NEXT: v_readlane_b32 s12, v23, 48
-; CHECK-NEXT: v_readlane_b32 s13, v23, 49
-; CHECK-NEXT: v_readlane_b32 s14, v23, 50
-; CHECK-NEXT: v_readlane_b32 s15, v23, 51
+; CHECK-NEXT: v_readlane_b32 s0, v23, 32
+; CHECK-NEXT: v_readlane_b32 s1, v23, 33
+; CHECK-NEXT: v_readlane_b32 s2, v23, 34
+; CHECK-NEXT: v_readlane_b32 s3, v23, 35
+; CHECK-NEXT: v_readlane_b32 s4, v23, 36
+; CHECK-NEXT: v_readlane_b32 s5, v23, 37
+; CHECK-NEXT: v_readlane_b32 s6, v23, 38
+; CHECK-NEXT: v_readlane_b32 s7, v23, 39
+; CHECK-NEXT: v_readlane_b32 s8, v23, 40
+; CHECK-NEXT: v_readlane_b32 s9, v23, 41
+; CHECK-NEXT: v_readlane_b32 s10, v23, 42
+; CHECK-NEXT: v_readlane_b32 s11, v23, 43
+; CHECK-NEXT: v_readlane_b32 s12, v23, 44
+; CHECK-NEXT: v_readlane_b32 s13, v23, 45
+; CHECK-NEXT: v_readlane_b32 s14, v23, 46
+; CHECK-NEXT: v_readlane_b32 s15, v23, 47
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
@@ -434,4 +426,4 @@ ret:
}
attributes #0 = { nounwind }
-attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" }
+attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index 65a17ed67481c..461500c8e740c 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
+define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) #0 {
; GCN-LABEL: v_shl_i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26,7 +26,7 @@ define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
ret i128 %shl
}
-define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) {
+define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) #0 {
; GCN-LABEL: v_lshr_i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52,7 +52,7 @@ define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) {
ret i128 %shl
}
-define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) {
+define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) #0 {
; GCN-LABEL: v_ashr_i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -79,7 +79,7 @@ define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) {
}
-define i128 @v_shl_i128_vk(i128 %lhs) {
+define i128 @v_shl_i128_vk(i128 %lhs) #0 {
; GCN-LABEL: v_shl_i128_vk:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -93,7 +93,7 @@ define i128 @v_shl_i128_vk(i128 %lhs) {
ret i128 %shl
}
-define i128 @v_lshr_i128_vk(i128 %lhs) {
+define i128 @v_lshr_i128_vk(i128 %lhs) #0 {
; GCN-LABEL: v_lshr_i128_vk:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -106,7 +106,7 @@ define i128 @v_lshr_i128_vk(i128 %lhs) {
ret i128 %shl
}
-define i128 @v_ashr_i128_vk(i128 %lhs) {
+define i128 @v_ashr_i128_vk(i128 %lhs) #0 {
; GCN-LABEL: v_ashr_i128_vk:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -120,7 +120,7 @@ define i128 @v_ashr_i128_vk(i128 %lhs) {
ret i128 %shl
}
-define i128 @v_shl_i128_kv(i128 %rhs) {
+define i128 @v_shl_i128_kv(i128 %rhs) #0 {
; GCN-LABEL: v_shl_i128_kv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -142,7 +142,7 @@ define i128 @v_shl_i128_kv(i128 %rhs) {
ret i128 %shl
}
-define i128 @v_lshr_i128_kv(i128 %rhs) {
+define i128 @v_lshr_i128_kv(i128 %rhs) #0 {
; GCN-LABEL: v_lshr_i128_kv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -161,7 +161,7 @@ define i128 @v_lshr_i128_kv(i128 %rhs) {
ret i128 %shl
}
-define i128 @v_ashr_i128_kv(i128 %rhs) {
+define i128 @v_ashr_i128_kv(i128 %rhs) #0 {
; GCN-LABEL: v_ashr_i128_kv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -178,14 +178,12 @@ define i128 @v_ashr_i128_kv(i128 %rhs) {
ret i128 %shl
}
-define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
+define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) #0 {
; GCN-LABEL: s_shl_i128_ss:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s5, s4, 64
; GCN-NEXT: s_sub_i32 s12, 64, s4
@@ -205,7 +203,6 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -214,14 +211,12 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
ret void
}
-define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
+define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) #0 {
; GCN-LABEL: s_lshr_i128_ss:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s5, s4, 64
; GCN-NEXT: s_sub_i32 s12, 64, s4
@@ -241,7 +236,6 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -250,14 +244,12 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
ret void
}
-define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
+define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) #0 {
; GCN-LABEL: s_ashr_i128_ss:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s5, 64, s4
; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
@@ -278,7 +270,6 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -287,7 +278,7 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
ret void
}
-define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
; GCN-LABEL: v_shl_v2i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -336,7 +327,7 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
ret <2 x i128> %shl
}
-define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
; GCN-LABEL: v_lshr_v2i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -385,7 +376,7 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
ret <2 x i128> %shl
}
-define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
; GCN-LABEL: v_ashr_v2i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -436,12 +427,9 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
ret <2 x i128> %shl
}
-define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
+define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
; GCN-LABEL: s_shl_v2i128ss:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v6, 16
; GCN-NEXT: v_mov_b32_e32 v4, 0
@@ -511,12 +499,9 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
ret void
}
-define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
+define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
; GCN-LABEL: s_lshr_v2i128_ss:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v6, 16
; GCN-NEXT: v_mov_b32_e32 v4, 0
@@ -586,12 +571,9 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
ret void
}
-define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
+define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
; GCN-LABEL: s_ashr_v2i128_ss:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v6, 16
; GCN-NEXT: v_mov_b32_e32 v4, 0
@@ -663,3 +645,4 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
ret void
}
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index 46f257eff1f24..0c029db96f558 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -4,14 +4,11 @@
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) {
+define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) #0 {
; CI-LABEL: sint_to_fp_i32_to_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -23,9 +20,6 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -39,19 +33,16 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
; uses an SGPR (implicit vcc).
-define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
+define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) #0 {
; CI-LABEL: sint_to_fp_i1_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -61,14 +52,11 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -79,14 +67,11 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
ret void
}
-define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) {
+define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) #0 {
; CI-LABEL: sint_to_fp_i1_f64_load:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_bitcmp1_b32 s2, 0
; CI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -101,9 +86,6 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s2, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -118,13 +100,10 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
ret void
}
-define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) {
+define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) #0 {
; CI-LABEL: s_sint_to_fp_i64_to_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3
; CI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -138,9 +117,6 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; VI-LABEL: s_sint_to_fp_i64_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -155,14 +131,11 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
ret void
}
-define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CI-LABEL: v_sint_to_fp_i64_to_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
@@ -182,9 +155,6 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -208,14 +178,11 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
}
; FIXME: bfe and sext on VI+
-define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) {
+define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) #0 {
; CI-LABEL: s_sint_to_fp_i8_to_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i8 s2, s2
; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
@@ -228,9 +195,6 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s2, s2, 0x80000
; VI-NEXT: s_sext_i32_i16 s2, s2
@@ -244,7 +208,7 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
ret void
}
-define double @v_sint_to_fp_i8_to_f64(i8 %in) {
+define double @v_sint_to_fp_i8_to_f64(i8 %in) #0 {
; CI-LABEL: v_sint_to_fp_i8_to_f64:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -263,19 +227,16 @@ define double @v_sint_to_fp_i8_to_f64(i8 %in) {
ret double %fp
}
-define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
+define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
; CI-LABEL: s_select_sint_to_fp_i1_vals_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -285,14 +246,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -303,7 +261,7 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
ret void
}
-define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
+define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
; GCN-LABEL: v_select_sint_to_fp_i1_vals_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -320,19 +278,16 @@ define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
ret void
}
-define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
+define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) #0 {
; CI-LABEL: s_select_sint_to_fp_i1_vals_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -342,14 +297,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -360,7 +312,7 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
ret void
}
-define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
+define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) #0 {
; GCN-LABEL: v_select_sint_to_fp_i1_vals_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -378,7 +330,7 @@ define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
}
; TODO: This should swap the selected order / invert the compare and do it.
-define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
+define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
; GCN-LABEL: v_swap_select_sint_to_fp_i1_vals_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -396,19 +348,16 @@ define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in
}
; TODO: This should swap the selected order / invert the compare and do it.
-define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
+define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
; CI-LABEL: s_swap_select_sint_to_fp_i1_vals_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0, 0xbff00000
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -418,14 +367,11 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0, 0xbff00000
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -435,3 +381,5 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
store double %select, ptr addrspace(1) %out, align 8
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index 9974d78af7ddf..0689bb4fb75eb 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -12,7 +12,7 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %14.sub0
+ ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %13.sub0
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %23:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %13
@@ -27,4 +27,4 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
-attributes #0 = { nounwind "amdgpu-num-vgpr"="5" }
+attributes #0 = { nounwind "amdgpu-num-vgpr"="5" "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index ef92cf3214e7f..0e67d7c6530c8 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -50,10 +50,7 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 {
define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
; HAWAII-LABEL: local_store_i55:
; HAWAII: ; %bb.0:
-; HAWAII-NEXT: s_add_i32 s12, s12, s17
; HAWAII-NEXT: s_or_b32 s0, s8, 14
-; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13
-; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s9
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
@@ -73,10 +70,7 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
;
; FIJI-LABEL: local_store_i55:
; FIJI: ; %bb.0:
-; FIJI-NEXT: s_add_i32 s12, s12, s17
; FIJI-NEXT: s_or_b32 s0, s8, 14
-; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s9
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
@@ -386,4 +380,4 @@ define void @local_store_i17(ptr addrspace(3) %ptr, i17 %arg) #0 {
ret void
}
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index 30accc846d2b6..d9613e7cda9c9 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
+; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @...............
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 9
@@ -23,7 +23,7 @@ entry:
ret void
}
-attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index 4f84b31f1877b..94bb08d24153a 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
+; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000 ................
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 5
@@ -23,7 +23,7 @@ entry:
ret void
}
-attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 644f434923368..82ba126bc0962 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
+; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @...............
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 9
@@ -23,7 +23,7 @@ entry:
ret void
}
-attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 69cc63eba6243..43f028cf8649c 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -9,7 +9,7 @@
declare void @llvm.trap() #0
declare void @llvm.debugtrap() #1
-define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
+define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) #2 {
; NOHSA-TRAP-GFX900-LABEL: trap:
; NOHSA-TRAP-GFX900: ; %bb.0:
; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -23,14 +23,11 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
; HSA-TRAP-GFX803-LABEL: trap:
; HSA-TRAP-GFX803: ; %bb.0:
; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
-; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
-; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7]
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s2
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s3
-; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7]
; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX803-NEXT: s_trap 2
@@ -103,7 +100,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
ret void
}
-define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr #2 {
; NOHSA-TRAP-GFX900-LABEL: non_entry_trap:
; NOHSA-TRAP-GFX900: ; %bb.0: ; %entry
; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -124,9 +121,6 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
; HSA-TRAP-GFX803-LABEL: non_entry_trap:
; HSA-TRAP-GFX803: ; %bb.0: ; %entry
; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
-; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1
@@ -267,7 +261,7 @@ ret:
ret void
}
-define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) #2 {
; NOHSA-TRAP-GFX900-LABEL: trap_with_use_after:
; NOHSA-TRAP-GFX900: ; %bb.0:
; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -286,9 +280,6 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
; HSA-TRAP-GFX803: ; %bb.0:
; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7]
; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
-; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
-; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5
@@ -403,7 +394,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
ret void
}
-define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) {
+define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) #2 {
; NOHSA-TRAP-GFX900-LABEL: debugtrap:
; NOHSA-TRAP-GFX900: ; %bb.0:
; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -420,13 +411,10 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
; HSA-TRAP-GFX803-LABEL: debugtrap:
; HSA-TRAP-GFX803: ; %bb.0:
; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
-; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
-; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1
; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
@@ -496,6 +484,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
attributes #0 = { nounwind noreturn }
attributes #1 = { nounwind }
+attributes #2 = { "amdgpu-no-flat-scratch-init" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index b1111876f0280..93dda473ffd82 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -5,7 +5,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GFX1030
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
-define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: udiv_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -81,9 +81,6 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-LABEL: udiv_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -185,7 +182,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
ret void
}
-define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; SI-LABEL: s_udiv_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -255,9 +252,6 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; GCN-LABEL: s_udiv_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3
; GCN-NEXT: s_sub_i32 s4, 0, s3
@@ -349,7 +343,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; The code generated by udiv is long and complex and may frequently
; change. The goal of this test is to make sure the ISel doesn't fail
; when it gets a v4i32 udiv
-define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: udiv_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -463,9 +457,6 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: udiv_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -625,7 +616,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
ret void
}
-define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: udiv_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -819,9 +810,6 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: udiv_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s2, 16
; GCN-NEXT: s_addc_u32 s5, s3, 0
@@ -1107,7 +1095,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
ret void
}
-define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: udiv_i32_div_pow2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1147,9 +1135,6 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac
; GCN-LABEL: udiv_i32_div_pow2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1195,7 +1180,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac
ret void
}
-define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: udiv_i32_div_k_even:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1239,9 +1224,6 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: udiv_i32_div_k_even:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1292,7 +1274,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
ret void
}
-define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: udiv_i32_div_k_odd:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1336,9 +1318,6 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; GCN-LABEL: udiv_i32_div_k_odd:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1389,7 +1368,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
ret void
}
-define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_udiv_i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1451,9 +1430,6 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; GCN-LABEL: v_udiv_i8:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1532,7 +1508,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
ret void
}
-define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_udiv_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1594,9 +1570,6 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: v_udiv_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1675,7 +1648,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
ret void
}
-define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_udiv_i23:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1753,9 +1726,6 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: v_udiv_i23:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s2, 4
; GCN-NEXT: s_addc_u32 s5, s3, 0
@@ -1875,7 +1845,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
ret void
}
-define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_udiv_i24:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1953,9 +1923,6 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: v_udiv_i24:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s2, 4
; GCN-NEXT: s_addc_u32 s5, s3, 0
@@ -2078,7 +2045,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
ret void
}
-define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) {
+define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) #0 {
; SI-LABEL: scalarize_mulhu_4xi32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -2138,9 +2105,6 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; GCN-LABEL: scalarize_mulhu_4xi32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
@@ -2226,7 +2190,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
ret void
}
-define amdgpu_kernel void @test_udiv2(i32 %p) {
+define amdgpu_kernel void @test_udiv2(i32 %p) #0 {
; SI-LABEL: test_udiv2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x9
@@ -2254,9 +2218,6 @@ define amdgpu_kernel void @test_udiv2(i32 %p) {
; GCN-LABEL: test_udiv2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshr_b32 s0, s0, 1
; GCN-NEXT: v_mov_b32_e32 v0, s0
@@ -2289,7 +2250,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) {
ret void
}
-define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
+define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) #0 {
; SI-LABEL: test_udiv_3_mulhu:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x9
@@ -2320,9 +2281,6 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0
@@ -2358,7 +2316,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
ret void
}
-define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readonly %arg) {
+define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readonly %arg) #0 {
; SI-LABEL: fdiv_test_denormals:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
@@ -2529,7 +2487,7 @@ bb:
ret void
}
-define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
+define i64 @v_test_udiv64_mulhi_fold(i64 %arg) #0 {
; SI-LABEL: v_test_udiv64_mulhi_fold:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2618,3 +2576,5 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
%d = udiv i64 %arg, 100000
ret i64 %d
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 97738a7944741..56f74f59b711a 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -4,14 +4,11 @@
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_uint_to_fp_i64_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
@@ -31,9 +28,6 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -56,13 +50,10 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) #0 {
; SI-LABEL: s_uint_to_fp_i64_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -76,9 +67,6 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; VI-LABEL: s_uint_to_fp_i64_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -93,14 +81,11 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 x i64> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 x i64> %in) #0 {
; SI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1
@@ -118,9 +103,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1
@@ -141,14 +123,11 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) #0 {
; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8
; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s1
@@ -181,9 +160,6 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s7
; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], s5
@@ -215,14 +191,11 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) {
+define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) #0 {
; SI-LABEL: s_uint_to_fp_i32_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; SI-NEXT: v_mov_b32_e32 v3, s1
@@ -234,9 +207,6 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -248,13 +218,10 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 x i32> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 x i32> %in) #0 {
; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
@@ -267,14 +234,11 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 x i32> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 x i32> %in) #0 {
; SI-LABEL: s_uint_to_fp_v4i32_to_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3
@@ -295,9 +259,6 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3
@@ -320,19 +281,16 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
; We can't fold the SGPRs into v_cndmask_b32_e32, because it already
; uses an SGPR (implicit vcc).
-define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) {
+define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) #0 {
; SI-LABEL: uint_to_fp_i1_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT: v_mov_b32_e32 v3, s1
-; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -342,14 +300,11 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -360,14 +315,11 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
ret void
}
-define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %in) {
+define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %in) #0 {
; SI-LABEL: uint_to_fp_i1_to_f64_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s2, 0
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -382,9 +334,6 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s2, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -399,14 +348,11 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) {
+define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) #0 {
; SI-LABEL: s_uint_to_fp_i8_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s2, s2, 0xff
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
@@ -419,9 +365,6 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xff
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
@@ -435,7 +378,7 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
}
; FIXME: Worse on VI
-define double @v_uint_to_fp_i8_to_f64(i8 %in) {
+define double @v_uint_to_fp_i8_to_f64(i8 %in) #0 {
; SI-LABEL: v_uint_to_fp_i8_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -454,19 +397,16 @@ define double @v_uint_to_fp_i8_to_f64(i8 %in) {
ret double %fp
}
-define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
+define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
; SI-LABEL: s_select_uint_to_fp_i1_vals_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT: v_mov_b32_e32 v3, s1
-; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -476,14 +416,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -494,7 +431,7 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
ret void
}
-define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
+define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
; GCN-LABEL: v_select_uint_to_fp_i1_vals_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -511,19 +448,16 @@ define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
ret void
}
-define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
+define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) #0 {
; SI-LABEL: s_select_uint_to_fp_i1_vals_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT: v_mov_b32_e32 v3, s1
-; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -533,14 +467,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -551,7 +482,7 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
ret void
}
-define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
+define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) #0 {
; GCN-LABEL: v_select_uint_to_fp_i1_vals_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -569,19 +500,16 @@ define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
}
; TODO: This should swap the selected order / invert the compare and do it.
-define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
+define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
; SI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: s_add_i32 s12, s12, s17
-; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000
; SI-NEXT: v_mov_b32_e32 v3, s1
-; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -591,14 +519,11 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -609,7 +534,7 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
ret void
}
-define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
+define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
; GCN-LABEL: v_swap_select_uint_to_fp_i1_vals_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -625,3 +550,5 @@ define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in
store double %select, ptr addrspace(1) %out, align 8
ret void
}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 3420707963db2..ab0f6e2511308 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -25,9 +25,8 @@
; CHECK-NEXT: argumentInfo:
; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; CHECK-NEXT: workGroupIDX: { reg: '$sgpr8' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
+; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
@@ -42,7 +41,7 @@
; CHECK-NEXT: BitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
@@ -95,7 +94,7 @@
; Function Attrs: convergent nocallback nofree nounwind willreturn
declare void @llvm.amdgcn.end.cf.i64(i64) #2
- attributes #0 = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+ attributes #0 = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { convergent nocallback nofree nounwind willreturn }
attributes #3 = { convergent nocallback nofree nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 720631a301192..d4be0efc6ed9d 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -25,9 +25,8 @@
; CHECK-NEXT: argumentInfo:
; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; CHECK-NEXT: workGroupIDX: { reg: '$sgpr8' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
+; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
@@ -42,7 +41,7 @@
; CHECK-NEXT: BitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
@@ -69,5 +68,5 @@ bb4:
ret void
}
-attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #1 = { nounwind readnone }
>From b225a48422d196e1daa4656cf9b75fd91e5ef53e Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Thu, 13 Mar 2025 17:00:02 -0700
Subject: [PATCH 4/6] Remove the pass from AMDGPUPassRegistry.def
---
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 6 ------
1 file changed, 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 6832a17c37177..29b7886fa08e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -148,9 +148,3 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-regbanklegalize", AMDGPURegBankLegalizePass(
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-regbank-combiner", AMDGPURegBankCombinerPass())
#undef DUMMY_MACHINE_FUNCTION_PASS
-
-
-#define DUMMY_CGSCC_PASS(NAME, CREATE_PASS)
-DUMMY_CGSCC_PASS("amdgpu-annotate-kernel-features", AMDGPUAnnotateKernelFeaturesPass())
-
-#undef DUMMY_CGSCC_PASS
>From c34d81aa3f60b532fe602b5c50f0135212855684 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Mon, 24 Mar 2025 11:10:56 -0700
Subject: [PATCH 5/6] Update tests.
Undo the changes in the previous commit. Now the amdgpu-no-flat-scratch-init
attribute is only manually added to tests that are relevant to the
attribute.
---
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 4 +-
.../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 370 +++-
.../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 392 ++++-
.../AMDGPU/GlobalISel/extractelement.ll | 79 +-
.../GlobalISel/insertelement-stack-lower.ll | 4 +-
.../AMDGPU/GlobalISel/lds-global-value.ll | 7 +-
.../GlobalISel/llvm.amdgcn.if.break.i64.ll | 7 +-
.../GlobalISel/llvm.amdgcn.trig.preop.ll | 29 +-
.../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 57 +-
.../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 57 +-
.../abi-attribute-hints-undefined-behavior.ll | 42 +-
llvm/test/CodeGen/AMDGPU/always-uniform.ll | 7 +-
...amdgpu-codegenprepare-fold-binop-select.ll | 6 +-
.../AMDGPU/attr-amdgpu-waves-per-eu.ll | 26 +-
.../attributor-flatscratchinit-invalid.ll | 547 +-----
.../attributor-flatscratchinit-invalid2.ll | 313 ++++
.../CodeGen/AMDGPU/combine-reg-or-const.ll | 5 +-
...dagcomb-extract-vec-elt-different-sizes.ll | 6 +-
.../expand-scalar-carry-out-select-user.ll | 7 +-
.../CodeGen/AMDGPU/extract_vector_elt-i8.ll | 102 +-
llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 81 +-
.../fast-unaligned-load-store.global.ll | 21 +-
llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 250 ++-
.../AMDGPU/fmul-2-combine-multi-use.ll | 50 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 82 +-
.../CodeGen/AMDGPU/fneg-modifier-casting.ll | 93 +-
llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 64 +-
llvm/test/CodeGen/AMDGPU/half.ll | 233 ++-
llvm/test/CodeGen/AMDGPU/hsa.ll | 6 +-
.../AMDGPU/insert_vector_elt.v2bf16.ll | 68 +-
.../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 228 ++-
.../CodeGen/AMDGPU/invalid-addrspacecast.ll | 7 +-
.../CodeGen/AMDGPU/invalid-cast-load-i1.ll | 5 +-
.../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll | 18 +-
.../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 18 +-
.../AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 39 +-
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 85 +-
.../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 116 +-
.../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 128 +-
llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 10 +-
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 143 +-
llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 85 +-
llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 20 +-
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 166 +-
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 148 +-
llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 107 +-
llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 304 ++--
.../AMDGPU/memory-legalizer-flat-agent.ll | 1566 +++++++++++++++-
.../memory-legalizer-flat-nontemporal.ll | 86 +-
.../memory-legalizer-flat-singlethread.ll | 1567 ++++++++++++++++-
.../AMDGPU/memory-legalizer-flat-system.ll | 1566 +++++++++++++++-
.../AMDGPU/memory-legalizer-flat-volatile.ll | 80 +-
.../AMDGPU/memory-legalizer-flat-wavefront.ll | 1549 +++++++++++++++-
.../AMDGPU/memory-legalizer-flat-workgroup.ll | 1498 +++++++++++++++-
.../AMDGPU/memory-legalizer-global-agent.ll | 457 ++++-
.../memory-legalizer-global-nontemporal.ll | 27 +-
.../memory-legalizer-global-singlethread.ll | 462 ++++-
.../AMDGPU/memory-legalizer-global-system.ll | 437 ++++-
.../memory-legalizer-global-volatile.ll | 31 +-
.../memory-legalizer-global-wavefront.ll | 462 ++++-
.../memory-legalizer-global-workgroup.ll | 462 ++++-
.../memory-legalizer-local-nontemporal.ll | 20 +-
.../AMDGPU/memory-legalizer-local-volatile.ll | 19 +-
.../memory-legalizer-private-nontemporal.ll | 70 +-
.../memory-legalizer-private-volatile.ll | 39 +-
llvm/test/CodeGen/AMDGPU/min.ll | 212 ++-
llvm/test/CodeGen/AMDGPU/pack.v2f16.ll | 23 +-
llvm/test/CodeGen/AMDGPU/pack.v2i16.ll | 20 +-
...al-regcopy-and-spill-missed-at-regalloc.ll | 51 +-
.../AMDGPU/preload-implicit-kernargs.ll | 170 +-
llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 379 ++--
llvm/test/CodeGen/AMDGPU/sad.ll | 151 +-
.../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll | 18 +-
.../scc-clobbered-sgpr-to-vmem-spill.ll | 466 ++---
llvm/test/CodeGen/AMDGPU/shift-i128.ll | 61 +-
llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 98 +-
.../CodeGen/AMDGPU/spill-vector-superclass.ll | 8 +-
llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 8 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll | 4 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll | 4 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll | 4 +-
llvm/test/CodeGen/AMDGPU/trap-abis.ll | 25 +-
llvm/test/CodeGen/AMDGPU/udiv.ll | 79 +-
llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 127 +-
84 files changed, 14124 insertions(+), 2824 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid2.ll
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 85c8b4183166a..a538c62748009 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -628,9 +628,7 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
(IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
- // The line below: If enableFlatScratch() is true, whether
- // no-flat-scratch-init is set is not important. If enableFlatScratch()
- // is false, FlatScratchInit cannot be true for graphics CC.
+ // FlatScratchInit cannot be true for graphics CC.
(ST.enableFlatScratch() ||
(!IsNoFlatScratchInitSet && !AMDGPU::isGraphics(CC))) &&
!ST.flatScratchIsArchitected()) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index 8654d0f789fac..ac24f81136fd6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -20,11 +20,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -35,11 +38,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -97,11 +103,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -112,11 +121,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -287,6 +299,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_dec_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -302,6 +317,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_dec_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -359,6 +377,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_dec_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -376,6 +397,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_dec_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -436,6 +460,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
; CI-LABEL: global_atomic_dec_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -453,6 +480,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
; VI-LABEL: global_atomic_dec_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -513,6 +543,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_dec_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -525,6 +558,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_dec_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -575,6 +611,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_dec_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -589,6 +628,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_dec_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -642,6 +684,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
; CI-LABEL: global_atomic_dec_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -656,6 +701,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
; VI-LABEL: global_atomic_dec_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -710,7 +758,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -718,6 +768,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -732,7 +783,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -740,6 +793,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -802,6 +856,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -819,6 +876,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -878,6 +938,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -893,6 +956,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -908,6 +974,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_ret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -922,6 +990,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_ret_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -958,6 +1030,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -975,6 +1050,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -992,6 +1070,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -1006,6 +1086,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1045,6 +1129,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; CI-LABEL: flat_atomic_dec_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -1062,6 +1149,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; VI-LABEL: flat_atomic_dec_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -1079,6 +1169,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -1093,6 +1185,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1132,6 +1228,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1144,6 +1243,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1156,6 +1258,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1167,6 +1271,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1199,6 +1307,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -1213,6 +1324,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -1227,6 +1341,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1238,6 +1354,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1273,6 +1393,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -1287,6 +1410,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -1301,6 +1427,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1312,6 +1440,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1348,7 +1480,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1356,6 +1490,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -1370,7 +1505,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1378,6 +1515,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -1392,6 +1530,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -1410,6 +1550,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 42
@@ -1466,6 +1610,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -1483,6 +1630,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1500,6 +1650,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
@@ -1513,6 +1665,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1559,10 +1715,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1580,10 +1739,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1601,7 +1763,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_ret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -1616,6 +1780,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_ret_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1654,12 +1822,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1677,12 +1848,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1700,7 +1874,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_ret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -1715,6 +1891,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_ret_i64_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1756,10 +1936,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1769,10 +1952,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1782,7 +1968,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -1794,6 +1982,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1828,12 +2020,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1843,12 +2038,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1858,7 +2056,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -1870,6 +2070,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i64_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1907,12 +2111,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1922,12 +2129,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1937,7 +2147,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -1949,6 +2161,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1987,6 +2203,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2013,6 +2232,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2039,12 +2261,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2058,6 +2282,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2116,6 +2344,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -2134,6 +2365,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -2152,12 +2386,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2166,6 +2402,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2219,8 +2459,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_store_dword v[0:1], v3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2237,8 +2480,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2312,7 +2558,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -2328,7 +2577,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -2394,7 +2646,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -2410,7 +2665,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -2594,10 +2852,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_dec_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2610,10 +2871,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_dec_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2671,12 +2935,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_dec_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2689,12 +2956,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_dec_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2753,12 +3023,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
; CI-LABEL: global_atomic_dec_ret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2771,12 +3044,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
; VI-LABEL: global_atomic_dec_ret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2835,10 +3111,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_dec_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2848,10 +3127,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_dec_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2902,12 +3184,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_dec_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2917,12 +3202,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_dec_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2974,12 +3262,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
; CI-LABEL: global_atomic_dec_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2989,12 +3280,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
; VI-LABEL: global_atomic_dec_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -3047,6 +3341,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -3070,6 +3367,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -3144,6 +3444,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -3162,6 +3465,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -3232,7 +3538,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v4, s3
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: flat_store_dword v[3:4], v0
@@ -3251,7 +3560,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dword v[3:4], v0
@@ -3319,7 +3631,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
}
attributes #0 = { nounwind speculatable willreturn memory(none) }
-attributes #1 = { nounwind "amdgpu-no-flat-scratch-init"}
+attributes #1 = { nounwind }
attributes #2 = { nounwind memory(none) }
!0 = !{i32 5, i32 6}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index 626ca2690f5fd..23c267e7d184e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -21,11 +21,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -36,11 +39,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -110,11 +116,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -125,11 +134,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -332,6 +344,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_inc_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -347,6 +362,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_inc_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -415,6 +433,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_inc_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -432,6 +453,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_inc_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -503,6 +527,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -520,6 +547,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -592,6 +622,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_inc_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -604,6 +637,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_inc_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -664,6 +700,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_inc_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -678,6 +717,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_inc_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -741,6 +783,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
; CI-LABEL: global_atomic_inc_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -755,6 +800,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
; VI-LABEL: global_atomic_inc_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -820,7 +868,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -828,6 +878,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -842,7 +893,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -850,6 +903,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -925,6 +979,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -942,6 +999,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1019,8 +1079,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_store_dword v[0:1], v3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1037,8 +1100,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1129,7 +1195,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -1145,7 +1214,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1224,7 +1296,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -1240,7 +1315,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1459,10 +1537,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_inc_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1475,10 +1556,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_inc_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1548,12 +1632,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_inc_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1566,12 +1653,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_inc_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1642,12 +1732,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
; CI-LABEL: global_atomic_inc_ret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1660,12 +1753,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
; VI-LABEL: global_atomic_inc_ret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1737,10 +1833,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_inc_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1750,10 +1849,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_inc_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1815,12 +1917,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_inc_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1830,12 +1935,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_inc_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1898,12 +2006,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
; CI-LABEL: global_atomic_inc_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1913,12 +2024,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
; VI-LABEL: global_atomic_inc_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1983,6 +2097,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2006,6 +2123,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2094,6 +2214,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -2112,6 +2235,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -2188,6 +2314,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -2203,6 +2332,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -2218,6 +2350,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_ret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2232,6 +2366,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_ret_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2281,6 +2419,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -2298,6 +2439,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -2315,6 +2459,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_ret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2329,6 +2475,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_ret_i32_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2381,6 +2531,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; CI-LABEL: flat_atomic_inc_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -2398,6 +2551,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; VI-LABEL: flat_atomic_inc_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -2415,6 +2571,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2429,6 +2587,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2482,6 +2644,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2494,6 +2659,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2506,6 +2674,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -2517,6 +2687,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2560,6 +2734,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -2574,6 +2751,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -2588,6 +2768,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -2599,6 +2781,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2645,6 +2831,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -2659,6 +2848,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -2673,6 +2865,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -2684,6 +2878,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2732,7 +2930,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2740,6 +2940,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -2754,7 +2955,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2762,6 +2965,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -2776,6 +2980,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2794,6 +3000,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 42
@@ -2871,6 +3081,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -2888,6 +3101,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -2905,6 +3121,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
@@ -2918,6 +3136,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2988,7 +3210,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v4, s3
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: flat_store_dword v[3:4], v0
@@ -3007,7 +3232,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dword v[3:4], v0
@@ -3097,10 +3325,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3118,10 +3349,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3139,7 +3373,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_ret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -3154,6 +3390,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_ret_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3206,12 +3446,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3229,12 +3472,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3252,7 +3498,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_ret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -3267,6 +3515,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_ret_i64_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3322,12 +3574,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; CI-LABEL: flat_atomic_inc_ret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3345,12 +3600,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; VI-LABEL: flat_atomic_inc_ret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3368,7 +3626,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -3383,6 +3643,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3439,10 +3703,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3452,10 +3719,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3465,7 +3735,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -3477,6 +3749,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3523,12 +3799,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3538,12 +3817,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3553,7 +3835,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -3565,6 +3849,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3614,12 +3902,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3629,12 +3920,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3644,7 +3938,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -3656,6 +3952,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3707,6 +4007,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -3733,6 +4036,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -3759,12 +4065,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3778,6 +4086,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -3858,6 +4170,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -3876,6 +4191,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -3894,12 +4212,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3908,6 +4228,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -3975,6 +4299,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s4
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0
@@ -3982,6 +4307,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -3995,6 +4322,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0
@@ -4002,6 +4330,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -4089,7 +4419,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
}
attributes #0 = { nounwind speculatable willreturn memory(none) }
-attributes #1 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #1 = { nounwind }
attributes #2 = { nounwind memory(none) }
!0 = !{i32 5, i32 6}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 1e3163e584ce1..9ef16aef0dd16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3004,7 +3004,7 @@ entry:
ret double %ext
}
-define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel) #0 {
+define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel) {
; GPRIDX-LABEL: dyn_extract_v5f64_s_s:
; GPRIDX: .amd_kernel_code_t
; GPRIDX-NEXT: amd_code_version_major = 1
@@ -3016,7 +3016,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -3027,7 +3027,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT: user_sgpr_count = 12
+; GPRIDX-NEXT: user_sgpr_count = 14
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -3042,7 +3042,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1
; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1
; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 15
+; GPRIDX-NEXT: wavefront_sgpr_count = 17
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -3107,7 +3107,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -3118,7 +3118,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT: user_sgpr_count = 12
+; MOVREL-NEXT: user_sgpr_count = 14
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -3133,7 +3133,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_sgpr_queue_ptr = 1
; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; MOVREL-NEXT: enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1
; MOVREL-NEXT: enable_sgpr_private_segment_size = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -3150,7 +3150,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 10
+; MOVREL-NEXT: wavefront_sgpr_count = 24
; MOVREL-NEXT: workitem_vgpr_count = 4
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
@@ -3168,21 +3168,24 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; MOVREL-NEXT: s_load_dword s8, s[8:9], 0x8
+; MOVREL-NEXT: s_add_i32 s12, s12, s17
+; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; MOVREL-NEXT: s_mov_b32 s4, 0
; MOVREL-NEXT: s_mov_b32 s5, 0x40080000
-; MOVREL-NEXT: s_mov_b32 s2, 0
-; MOVREL-NEXT: s_mov_b32 s3, 0x40140000
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s8, 1
; MOVREL-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
; MOVREL-NEXT: s_cmp_eq_u32 s8, 2
+; MOVREL-NEXT: s_mov_b32 s2, 0
; MOVREL-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; MOVREL-NEXT: s_cmp_eq_u32 s8, 3
+; MOVREL-NEXT: s_mov_b32 s3, 0x40140000
; MOVREL-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5]
; MOVREL-NEXT: s_cmp_eq_u32 s8, 4
; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; MOVREL-NEXT: v_mov_b32_e32 v0, s2
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
+; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13
; MOVREL-NEXT: v_mov_b32_e32 v1, s3
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -3210,7 +3213,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT: user_sgpr_count = 12
+; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -3225,7 +3228,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_sgpr_queue_ptr = 1
; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GFX10-NEXT: enable_sgpr_dispatch_id = 1
-; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1
; GFX10-NEXT: enable_sgpr_private_segment_size = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4019,7 +4022,7 @@ entry:
ret float %ext
}
-define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %sel) #0 {
+define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %sel) {
; GPRIDX-LABEL: dyn_extract_v4f32_s_s_s:
; GPRIDX: .amd_kernel_code_t
; GPRIDX-NEXT: amd_code_version_major = 1
@@ -4042,7 +4045,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT: user_sgpr_count = 12
+; GPRIDX-NEXT: user_sgpr_count = 14
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4057,7 +4060,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1
; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1
; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4074,7 +4077,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 14
+; GPRIDX-NEXT: wavefront_sgpr_count = 16
; GPRIDX-NEXT: workitem_vgpr_count = 2
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -4115,7 +4118,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4126,7 +4129,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT: user_sgpr_count = 12
+; MOVREL-NEXT: user_sgpr_count = 14
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4141,7 +4144,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_sgpr_queue_ptr = 1
; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; MOVREL-NEXT: enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1
; MOVREL-NEXT: enable_sgpr_private_segment_size = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4158,7 +4161,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 10
+; MOVREL-NEXT: wavefront_sgpr_count = 24
; MOVREL-NEXT: workitem_vgpr_count = 3
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
@@ -4176,6 +4179,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dword s2, s[8:9], 0x8
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; MOVREL-NEXT: s_add_i32 s12, s12, s17
+; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s2, 1
; MOVREL-NEXT: s_cselect_b32 s3, 2.0, 1.0
@@ -4211,7 +4217,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT: user_sgpr_count = 12
+; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4226,7 +4232,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_sgpr_queue_ptr = 1
; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GFX10-NEXT: enable_sgpr_dispatch_id = 1
-; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1
; GFX10-NEXT: enable_sgpr_private_segment_size = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4364,7 +4370,7 @@ entry:
ret void
}
-define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %sel) #0 {
+define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %sel) {
; GPRIDX-LABEL: dyn_extract_v4f64_s_s_s:
; GPRIDX: .amd_kernel_code_t
; GPRIDX-NEXT: amd_code_version_major = 1
@@ -4387,7 +4393,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT: user_sgpr_count = 12
+; GPRIDX-NEXT: user_sgpr_count = 14
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4402,7 +4408,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1
; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1
; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4419,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 14
+; GPRIDX-NEXT: wavefront_sgpr_count = 16
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -4463,7 +4469,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4474,7 +4480,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT: user_sgpr_count = 12
+; MOVREL-NEXT: user_sgpr_count = 14
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4489,7 +4495,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_sgpr_queue_ptr = 1
; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; MOVREL-NEXT: enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1
; MOVREL-NEXT: enable_sgpr_private_segment_size = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4506,7 +4512,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 10
+; MOVREL-NEXT: wavefront_sgpr_count = 24
; MOVREL-NEXT: workitem_vgpr_count = 4
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
@@ -4524,10 +4530,12 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dword s6, s[8:9], 0x8
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; MOVREL-NEXT: s_add_i32 s12, s12, s17
+; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; MOVREL-NEXT: s_mov_b32 s2, 0
-; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s6, 1
+; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
; MOVREL-NEXT: s_cmp_eq_u32 s6, 2
; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
@@ -4535,6 +4543,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
; MOVREL-NEXT: v_mov_b32_e32 v0, s2
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
+; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13
; MOVREL-NEXT: v_mov_b32_e32 v1, s3
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -4562,7 +4571,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT: user_sgpr_count = 12
+; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4577,7 +4586,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_sgpr_queue_ptr = 1
; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GFX10-NEXT: enable_sgpr_dispatch_id = 1
-; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1
; GFX10-NEXT: enable_sgpr_private_segment_size = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4859,5 +4868,3 @@ define i32 @v_extract_v64i32_37(ptr addrspace(1) %ptr) {
%elt = extractelement <64 x i32> %vec, i32 37
ret i32 %elt
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index b025ec3e1da4d..94853767ccfac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[20:23], s[8:9], 0x0
; GCN-NEXT: s_load_dwordx2 s[24:25], s[8:9], 0x10
-; GCN-NEXT: s_add_u32 s0, s0, s15
+; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: v_mov_b32_e32 v64, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -256,4 +256,4 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
ret void
}
-attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,10" "amdgpu-no-flat-scratch-init" }
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,10" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
index 11afcda8fbc53..859f7ef16e395 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -11,13 +11,16 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace(
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 4
; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_add_i32 s12, s12, s17
; CHECK-NEXT: ds_read_b32 v2, v0
-; CHECK-NEXT: v_mov_b32_e32 v3, 9
+; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_add_u32 s0, s0, 4
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_mov_b32_e32 v3, 9
; CHECK-NEXT: flat_store_dword v[0:1], v2
; CHECK-NEXT: v_mov_b32_e32 v0, 0x200
; CHECK-NEXT: ds_write_b32 v0, v3
@@ -31,4 +34,4 @@ entry:
ret void
}
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
index e719270c8620d..a5a75f74833f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
@@ -1,11 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) #0 {
+define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) {
; GCN-LABEL: test_wave64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s2, s[8:9], 0x0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0xa
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 1, 0
@@ -25,5 +28,3 @@ entry:
}
declare i64 @llvm.amdgcn.if.break.i64(i1, i64)
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
index 35fb563844865..1deee215e522b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
@@ -37,11 +37,14 @@ define double @v_trig_preop_f64_imm(double %a, i32 %b) {
ret double %result
}
-define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) #1 {
+define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; CI-LABEL: s_trig_preop_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -59,6 +62,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) #1 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -76,6 +82,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) #1 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -85,6 +93,10 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) #1 {
;
; GFX10-LABEL: s_trig_preop_f64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
@@ -109,10 +121,13 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) #1 {
ret void
}
-define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) #1 {
+define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
; CI-LABEL: s_trig_preop_f64_imm:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; CI-NEXT: s_add_u32 s0, s0, 4
@@ -128,6 +143,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) #1 {
; VI-LABEL: s_trig_preop_f64_imm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; VI-NEXT: s_add_u32 s0, s0, 4
@@ -143,6 +161,8 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) #1 {
; GFX9-LABEL: s_trig_preop_f64_imm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
@@ -151,6 +171,10 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) #1 {
;
; GFX10-LABEL: s_trig_preop_f64_imm:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
@@ -174,4 +198,3 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) #1 {
declare double @llvm.amdgcn.trig.preop.f64(double, i32) #0
attributes #0 = { nounwind readnone speculatable }
-attributes #1 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 0817f325a7fe5..b59f85b2dfa38 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -3,10 +3,13 @@
; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) {
; GFX8-LABEL: sdivrem_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s6, s5, 31
; GFX8-NEXT: s_add_i32 s0, s5, s6
@@ -142,10 +145,13 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
; GFX8-LABEL: sdivrem_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s2, s9, 31
; GFX8-NEXT: s_ashr_i32 s12, s11, 31
@@ -613,10 +619,13 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) #0 {
+define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
; GFX8-LABEL: sdivrem_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s2, s10, 31
; GFX8-NEXT: s_add_i32 s0, s10, s2
@@ -842,9 +851,12 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) #0 {
+define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
; GFX8-LABEL: sdivrem_v4i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1268,9 +1280,12 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) #0 {
+define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
; GFX8-LABEL: sdivrem_v2i64:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2183,10 +2198,13 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) #0 {
+define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) {
; GFX8-LABEL: sdiv_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -2328,10 +2346,13 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) #0 {
+define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) {
; GFX8-LABEL: sdivrem_v2i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010
; GFX8-NEXT: s_ashr_i32 s3, s0, 31
@@ -2592,10 +2613,13 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) #0 {
+define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) {
; GFX8-LABEL: sdiv_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -2737,10 +2761,13 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
ret void
}
-define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) #0 {
+define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) {
; GFX8-LABEL: sdivrem_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sext_i32_i16 s0, s3
; GFX8-NEXT: s_ashr_i32 s10, s0, 31
@@ -2998,10 +3025,13 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) #0 {
+define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) {
; GFX8-LABEL: sdivrem_i3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -3149,10 +3179,13 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
ret void
}
-define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) #0 {
+define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) {
; GFX8-LABEL: sdivrem_i27:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -3299,5 +3332,3 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
store i27 %rem, ptr addrspace(1) %out1
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 4e8d82003ddb3..ff0114cfc3ddb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -3,10 +3,13 @@
; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) {
; GFX8-LABEL: udivrem_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
; GFX8-NEXT: s_sub_i32 s0, 0, s5
@@ -109,10 +112,13 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
; GFX8-LABEL: udivrem_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10
@@ -519,10 +525,13 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) #0 {
+define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
; GFX8-LABEL: udivrem_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11
@@ -682,9 +691,12 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) #0 {
+define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
; GFX8-LABEL: udivrem_v4i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -977,9 +989,12 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) #0 {
+define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
; GFX8-LABEL: udivrem_v2i64:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x20
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1768,10 +1783,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) #0 {
+define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) {
; GFX8-LABEL: udiv_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
@@ -1880,11 +1898,14 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) #0 {
+define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) {
; GFX8-LABEL: udivrem_v2i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s0, s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
@@ -2077,10 +2098,13 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) #0 {
+define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) {
; GFX8-LABEL: udiv_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s5, s4, 16
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
@@ -2189,11 +2213,14 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
ret void
}
-define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) #0 {
+define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) {
; GFX8-LABEL: udivrem_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s2, s1, 0xffff
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
@@ -2383,10 +2410,13 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
ret void
}
-define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) #0 {
+define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) {
; GFX8-LABEL: udivrem_i3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
@@ -2501,10 +2531,13 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
ret void
}
-define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) #0 {
+define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) {
; GFX8-LABEL: udivrem_i27:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
@@ -2618,5 +2651,3 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
store i27 %rem, ptr addrspace(1) %out1
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 653e17f75df11..7a7863462357b 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -93,7 +93,7 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
}
; Function is marked with amdgpu-no-workitem-id-* but uses them anyway
-define void @marked_func_use_workitem_id(ptr addrspace(1) %ptr) #1 {
+define void @marked_func_use_workitem_id(ptr addrspace(1) %ptr) #0 {
; FIXEDABI-SDAG-LABEL: marked_func_use_workitem_id:
; FIXEDABI-SDAG: ; %bb.0:
; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -131,10 +131,13 @@ define void @marked_func_use_workitem_id(ptr addrspace(1) %ptr) #1 {
}
; Function is marked with amdgpu-no-workitem-id-* but uses them anyway
-define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr) #1 {
+define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr) #0 {
; FIXEDABI-LABEL: marked_kernel_use_workitem_id:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; FIXEDABI-NEXT: s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0)
; FIXEDABI-NEXT: v_mov_b32_e32 v4, s1
; FIXEDABI-NEXT: v_mov_b32_e32 v3, s0
@@ -154,7 +157,7 @@ define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr)
ret void
}
-define void @marked_func_use_workgroup_id(ptr addrspace(1) %ptr) #1 {
+define void @marked_func_use_workgroup_id(ptr addrspace(1) %ptr) #0 {
; FIXEDABI-LABEL: marked_func_use_workgroup_id:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -177,20 +180,23 @@ define void @marked_func_use_workgroup_id(ptr addrspace(1) %ptr) #1 {
ret void
}
-define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr) #1 {
+define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr) #0 {
; FIXEDABI-LABEL: marked_kernel_use_workgroup_id:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s6
+; FIXEDABI-NEXT: s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
+; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8
; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0)
; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0
; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1
; FIXEDABI-NEXT: flat_store_dword v[0:1], v2
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s7
+; FIXEDABI-NEXT: v_mov_b32_e32 v2, s9
; FIXEDABI-NEXT: flat_store_dword v[0:1], v2
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8
+; FIXEDABI-NEXT: v_mov_b32_e32 v2, s10
; FIXEDABI-NEXT: flat_store_dword v[0:1], v2
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
; FIXEDABI-NEXT: s_endpgm
@@ -203,7 +209,7 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr)
ret void
}
-define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #1 {
+define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
; FIXEDABI-LABEL: marked_func_use_other_sgpr:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -235,9 +241,12 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #1 {
ret void
}
-define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #1 {
+define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
; FIXEDABI: ; %bb.0:
+; FIXEDABI-NEXT: s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; FIXEDABI-NEXT: s_add_u32 s0, s4, 8
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0
@@ -258,10 +267,13 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
ret void
}
-define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #1 {
+define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 {
; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr:
; FIXEDABI: ; %bb.0:
+; FIXEDABI-NEXT: s_add_i32 s4, s4, s9
; FIXEDABI-NEXT: v_mov_b32_e32 v0, 0
+; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-NEXT: v_mov_b32_e32 v1, 0
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
; FIXEDABI-NEXT: s_endpgm
@@ -326,7 +338,7 @@ define void @addrspacecast_requires_queue_ptr(ptr addrspace(5) %ptr.private, ptr
ret void
}
-define void @is_shared_requires_queue_ptr(ptr %ptr) #1 {
+define void @is_shared_requires_queue_ptr(ptr %ptr) #0 {
; FIXEDABI-LABEL: is_shared_requires_queue_ptr:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -344,7 +356,7 @@ define void @is_shared_requires_queue_ptr(ptr %ptr) #1 {
ret void
}
-define void @is_private_requires_queue_ptr(ptr %ptr) #1 {
+define void @is_private_requires_queue_ptr(ptr %ptr) #0 {
; FIXEDABI-LABEL: is_private_requires_queue_ptr:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -362,7 +374,7 @@ define void @is_private_requires_queue_ptr(ptr %ptr) #1 {
ret void
}
-define void @trap_requires_queue() #1 {
+define void @trap_requires_queue() #0 {
; FIXEDABI-LABEL: trap_requires_queue:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -374,7 +386,7 @@ define void @trap_requires_queue() #1 {
unreachable
}
-define void @debugtrap_requires_queue() #1 {
+define void @debugtrap_requires_queue() #0 {
; FIXEDABI-LABEL: debugtrap_requires_queue:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -400,7 +412,5 @@ declare void @llvm.debugtrap()
attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-work-group-id-x" "amdgpu-no-work-group-id-y" "amdgpu-no-work-group-id-z" "amdgpu-no-work-item-id-x" "amdgpu-no-work-item-id-y" "amdgpu-no-work-item-id-z" }
-attributes #1 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-work-group-id-x" "amdgpu-no-work-group-id-y" "amdgpu-no-work-group-id-z" "amdgpu-no-work-item-id-x" "amdgpu-no-work-item-id-y" "amdgpu-no-work-item-id-z" }
-
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
index b88a616b80ef2..4e7022710c671 100644
--- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
@@ -4,12 +4,14 @@
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.readfirstlane(i32)
-define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapture readonly, ptr addrspace(1) noalias nocapture readonly) #0 {
+define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapture readonly, ptr addrspace(1) noalias nocapture readonly) {
; GCN-LABEL: readfirstlane_uniform:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s0, s0, s4
@@ -18,6 +20,7 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
; GCN-NEXT: s_add_u32 s0, s2, 40
; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
@@ -32,5 +35,3 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
store float %val, ptr addrspace(1) %gep1, align 4
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 755449b3bce7b..e71bf15384727 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -384,7 +384,7 @@ define i32 @select_mul_rhs_const_i32(i1 %cond) {
ret i32 %op
}
-define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) #1 {
+define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
; IR-LABEL: @select_add_lhs_const_i16(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 128, i16 131
; IR-NEXT: store i16 [[OP]], ptr addrspace(1) poison, align 2
@@ -393,6 +393,9 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) #1 {
; GCN-LABEL: select_add_lhs_const_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s0, 0
; GCN-NEXT: s_movk_i32 s0, 0x80
@@ -510,4 +513,3 @@ define <2 x half> @multi_use_cast_regression(i1 %cond) {
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #0
attributes #0 = { nounwind readnone speculatable }
-attributes #1 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index aec619b837a1f..4507fd5865989 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -10,7 +10,7 @@ define amdgpu_kernel void @empty_exactly_1() #0 {
entry:
ret void
}
-attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-flat-scratch-init"}
+attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,64" }
; Exactly 5 waves per execution unit.
; CHECK-LABEL: {{^}}empty_exactly_5:
@@ -22,7 +22,7 @@ define amdgpu_kernel void @empty_exactly_5() #1 {
entry:
ret void
}
-attributes #1 = {"amdgpu-waves-per-eu"="5,5" "amdgpu-no-flat-scratch-init"}
+attributes #1 = {"amdgpu-waves-per-eu"="5,5"}
; Exactly 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_exactly_10:
@@ -34,7 +34,7 @@ define amdgpu_kernel void @empty_exactly_10() #2 {
entry:
ret void
}
-attributes #2 = {"amdgpu-waves-per-eu"="10,10" "amdgpu-no-flat-scratch-init"}
+attributes #2 = {"amdgpu-waves-per-eu"="10,10"}
; At least 1 wave per execution unit.
; CHECK-LABEL: {{^}}empty_at_least_1:
@@ -46,7 +46,7 @@ define amdgpu_kernel void @empty_at_least_1() #3 {
entry:
ret void
}
-attributes #3 = {"amdgpu-waves-per-eu"="1" "amdgpu-no-flat-scratch-init"}
+attributes #3 = {"amdgpu-waves-per-eu"="1"}
; At least 5 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_least_5:
@@ -58,7 +58,7 @@ define amdgpu_kernel void @empty_at_least_5() #4 {
entry:
ret void
}
-attributes #4 = {"amdgpu-waves-per-eu"="5" "amdgpu-no-flat-scratch-init"}
+attributes #4 = {"amdgpu-waves-per-eu"="5"}
; At least 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_least_10:
@@ -70,7 +70,7 @@ define amdgpu_kernel void @empty_at_least_10() #5 {
entry:
ret void
}
-attributes #5 = {"amdgpu-waves-per-eu"="10" "amdgpu-no-flat-scratch-init"}
+attributes #5 = {"amdgpu-waves-per-eu"="10"}
; At most 1 wave per execution unit (same as @empty_exactly_1).
@@ -84,7 +84,7 @@ define amdgpu_kernel void @empty_at_most_5() #6 {
entry:
ret void
}
-attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-flat-scratch-init"}
+attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64"}
; At most 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_most_10:
@@ -96,7 +96,7 @@ define amdgpu_kernel void @empty_at_most_10() #7 {
entry:
ret void
}
-attributes #7 = {"amdgpu-waves-per-eu"="1,10" "amdgpu-no-flat-scratch-init"}
+attributes #7 = {"amdgpu-waves-per-eu"="1,10"}
; Between 1 and 5 waves per execution unit (same as @empty_at_most_5).
@@ -110,15 +110,15 @@ define amdgpu_kernel void @empty_between_5_and_10() #8 {
entry:
ret void
}
-attributes #8 = {"amdgpu-waves-per-eu"="5,10" "amdgpu-no-flat-scratch-init"}
+attributes #8 = {"amdgpu-waves-per-eu"="5,10"}
@var = addrspace(1) global float 0.0
; Exactly 10 waves per execution unit.
; CHECK-LABEL: {{^}}exactly_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 3
; CHECK: VGPRBlocks: 5
-; CHECK: NumSGPRsForWavesPerEU: 20
+; CHECK: NumSGPRsForWavesPerEU: 30
; CHECK: NumVGPRsForWavesPerEU: 24
define amdgpu_kernel void @exactly_10() #9 {
%val0 = load volatile float, ptr addrspace(1) @var
@@ -187,7 +187,7 @@ define amdgpu_kernel void @exactly_10() #9 {
ret void
}
-attributes #9 = {"amdgpu-waves-per-eu"="10,10" "amdgpu-no-flat-scratch-init"}
+attributes #9 = {"amdgpu-waves-per-eu"="10,10"}
; Exactly 256 workitems and exactly 2 waves.
; CHECK-LABEL: {{^}}empty_workitems_exactly_256_waves_exactly_2:
@@ -199,4 +199,4 @@ define amdgpu_kernel void @empty_workitems_exactly_256_waves_exactly_2() #10 {
entry:
ret void
}
-attributes #10 = {"amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="2,2" "amdgpu-no-flat-scratch-init"}
+attributes #10 = {"amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="2,2"}
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid.ll
index 8b816dfc28728..1b422252573db 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid.ll
@@ -1,550 +1,63 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s
;
-; These functions all should have the attribute amdgpu-no-flat-scratch-init set if the AMDGPUAttributor
-; pass is run. Therefore the purpose is to test llc when the attribute is incorrectly missing.
+; None of these functions should have the attribute amdgpu-no-flat-scratch-init. In these tests
+; we manually set the attribute for the functions. The purpose is to test how the amdgpu-attributor pass
+; handles this situation.
;
;; tests of addrspacecast
-define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: without_private_to_flat_addrspacecast:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: without_private_to_flat_addrspacecast:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store volatile i32 0, ptr addrspace(5) %ptr
+define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
+ %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+ store volatile i32 0, ptr %stof
ret void
}
-define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: without_private_to_flat_addrspacecast_cc_kernel:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[22:23], s[2:3]
-; GFX9-NEXT: s_mov_b64 s[20:21], s[0:1]
-; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 s20, s20, s17
-; GFX9-NEXT: s_addc_u32 s21, s21, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: buffer_store_dword v0, v1, s[20:23], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: without_private_to_flat_addrspacecast_cc_kernel:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_mov_b64 s[22:23], s[2:3]
-; GFX10-NEXT: s_mov_b64 s[20:21], s[0:1]
-; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_add_u32 s20, s20, s17
-; GFX10-NEXT: s_addc_u32 s21, s21, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-NEXT: buffer_store_dword v0, v1, s[20:23], 0 offen
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_endpgm
- store volatile i32 0, ptr addrspace(5) %ptr
+define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 {
+ %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+ store volatile i32 0, ptr %stof
ret void
}
-define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: call_without_private_to_flat_addrspacecast:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[16:17]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, without_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, without_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX9-NEXT: v_writelane_b32 v2, s30, 0
-; GFX9-NEXT: v_writelane_b32 v2, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: v_readlane_b32 s31, v2, 1
-; GFX9-NEXT: v_readlane_b32 s30, v2, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_mov_b32 s33, s18
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: call_without_private_to_flat_addrspacecast:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s18, s33
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
-; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s16
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_getpc_b64 s[16:17]
-; GFX10-NEXT: s_add_u32 s16, s16, without_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s17, s17, without_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v2, s30, 0
-; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: v_writelane_b32 v2, s31, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX10-NEXT: v_readlane_b32 s31, v2, 1
-; GFX10-NEXT: v_readlane_b32 s30, v2, 0
-; GFX10-NEXT: s_mov_b32 s32, s33
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_mov_b32 s33, s18
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
+ call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
ret void
}
-define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: call_without_private_to_flat_addrspacecast_cc_kernel:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: s_add_u32 s0, s0, s17
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_load_dword s17, s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 s8, s8, 8
-; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_mov_b32 s13, s15
-; GFX9-NEXT: s_mov_b32 s12, s14
-; GFX9-NEXT: s_getpc_b64 s[14:15]
-; GFX9-NEXT: s_add_u32 s14, s14, without_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s15, s15, without_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX9-NEXT: s_mov_b32 s14, s16
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s17
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: call_without_private_to_flat_addrspacecast_cc_kernel:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_mov_b32 s32, 0
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; GFX10-NEXT: s_add_u32 s0, s0, s17
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_load_dword s17, s[8:9], 0x0
-; GFX10-NEXT: s_add_u32 s8, s8, 8
-; GFX10-NEXT: s_addc_u32 s9, s9, 0
-; GFX10-NEXT: s_mov_b32 s13, s15
-; GFX10-NEXT: s_mov_b32 s12, s14
-; GFX10-NEXT: s_getpc_b64 s[14:15]
-; GFX10-NEXT: s_add_u32 s14, s14, without_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s15, s15, without_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX10-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX10-NEXT: s_mov_b32 s14, s16
-; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s17
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX10-NEXT: s_endpgm
- call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 {
+ call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
ret void
}
-define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: call_call_without_private_to_flat_addrspacecast:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s19, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[16:17]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, call_without_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, call_without_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX9-NEXT: v_writelane_b32 v3, s30, 0
-; GFX9-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: v_readlane_b32 s31, v3, 1
-; GFX9-NEXT: v_readlane_b32 s30, v3, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_mov_b32 s33, s19
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: call_call_without_private_to_flat_addrspacecast:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s19, s33
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
-; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s16
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_getpc_b64 s[16:17]
-; GFX10-NEXT: s_add_u32 s16, s16, call_without_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s17, s17, call_without_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v3, s30, 0
-; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: v_writelane_b32 v3, s31, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX10-NEXT: v_readlane_b32 s31, v3, 1
-; GFX10-NEXT: v_readlane_b32 s30, v3, 0
-; GFX10-NEXT: s_mov_b32 s32, s33
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_mov_b32 s33, s19
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
- ret void
-}
+;; tests of addrspacecast in a constant
-define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: call_call_without_private_to_flat_addrspacecast_cc_kernel:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: s_add_u32 s0, s0, s17
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_load_dword s17, s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 s8, s8, 8
-; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_mov_b32 s13, s15
-; GFX9-NEXT: s_mov_b32 s12, s14
-; GFX9-NEXT: s_getpc_b64 s[14:15]
-; GFX9-NEXT: s_add_u32 s14, s14, call_without_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s15, s15, call_without_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX9-NEXT: s_mov_b32 s14, s16
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s17
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: call_call_without_private_to_flat_addrspacecast_cc_kernel:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_mov_b32 s32, 0
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; GFX10-NEXT: s_add_u32 s0, s0, s17
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_load_dword s17, s[8:9], 0x0
-; GFX10-NEXT: s_add_u32 s8, s8, 8
-; GFX10-NEXT: s_addc_u32 s9, s9, 0
-; GFX10-NEXT: s_mov_b32 s13, s15
-; GFX10-NEXT: s_mov_b32 s12, s14
-; GFX10-NEXT: s_getpc_b64 s[14:15]
-; GFX10-NEXT: s_add_u32 s14, s14, call_without_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s15, s15, call_without_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX10-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX10-NEXT: s_mov_b32 s14, s16
-; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s17
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX10-NEXT: s_endpgm
- call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) #0 {
+ store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8
ret void
}
-;; tests of indirect call, intrinsics, inline asm
-
-@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
+;; tests of intrinsics
-define void @empty() {
-; GFX9-LABEL: empty:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: empty:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 {
+ %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
+ store volatile i32 7, ptr %1, align 4
ret void
}
-define void @also_empty() {
-; GFX9-LABEL: also_empty:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: also_empty:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+define void @calls_intrin_ascast(ptr addrspace(3) %ptr) #0 {
+ %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
+ store volatile i32 7, ptr %1, align 4
ret void
}
-define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) {
-; GFX9-LABEL: indirect_call_known_callees:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: s_add_u32 s0, s0, s17
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_mov_b32 s13, s15
-; GFX9-NEXT: s_mov_b32 s12, s14
-; GFX9-NEXT: s_load_dword s17, s[8:9], 0x0
-; GFX9-NEXT: s_getpc_b64 s[14:15]
-; GFX9-NEXT: s_add_u32 s14, s14, empty@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s15, s15, empty@gotpcrel32@hi+12
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, also_empty@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, also_empty@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[22:23], s[14:15], 0x0
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s14, 1, s17
-; GFX9-NEXT: s_cmp_eq_u32 s14, 1
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: s_cselect_b32 s19, s23, s21
-; GFX9-NEXT: s_cselect_b32 s18, s22, s20
-; GFX9-NEXT: s_add_u32 s8, s8, 8
-; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX9-NEXT: s_mov_b32 s14, s16
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: indirect_call_known_callees:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_mov_b32 s32, 0
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; GFX10-NEXT: s_add_u32 s0, s0, s17
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_mov_b32 s13, s15
-; GFX10-NEXT: s_mov_b32 s12, s14
-; GFX10-NEXT: s_getpc_b64 s[14:15]
-; GFX10-NEXT: s_add_u32 s14, s14, also_empty@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s15, s15, also_empty@gotpcrel32@hi+12
-; GFX10-NEXT: s_getpc_b64 s[18:19]
-; GFX10-NEXT: s_add_u32 s18, s18, empty@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s19, s19, empty@gotpcrel32@hi+12
-; GFX10-NEXT: s_load_dword s17, s[8:9], 0x0
-; GFX10-NEXT: s_load_dwordx2 s[20:21], s[14:15], 0x0
-; GFX10-NEXT: s_load_dwordx2 s[22:23], s[18:19], 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_and_b32 s14, 1, s17
-; GFX10-NEXT: s_cmp_eq_u32 s14, 1
-; GFX10-NEXT: s_mov_b32 s14, s16
-; GFX10-NEXT: s_cselect_b32 s19, s23, s21
-; GFX10-NEXT: s_cselect_b32 s18, s22, s20
-; GFX10-NEXT: s_add_u32 s8, s8, 8
-; GFX10-NEXT: s_addc_u32 s9, s9, 0
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX10-NEXT: s_endpgm
- %fptr = select i1 %cond, ptr @empty, ptr @also_empty
- call void %fptr()
+define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 {
+ call void @calls_intrin_ascast(ptr addrspace(3) %ptr)
ret void
}
-declare i32 @llvm.amdgcn.workgroup.id.x()
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
-define void @use_intrinsic_workitem_id_x() {
-; GFX9-LABEL: use_intrinsic_workitem_id_x:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_and_b32_e32 v2, 0x3ff, v31
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: use_intrinsic_workitem_id_x:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_and_b32_e32 v2, 0x3ff, v31
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %val = call i32 @llvm.amdgcn.workitem.id.x()
- store volatile i32 %val, ptr addrspace(1) null
- ret void
-}
-
-define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
-; GFX9-LABEL: use_intrinsic_workitem_id_x_cc_kernel:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dword v[1:2], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: use_intrinsic_workitem_id_x_cc_kernel:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: global_store_dword v[1:2], v0, off
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_endpgm
- %val = call i32 @llvm.amdgcn.workitem.id.x()
- store volatile i32 %val, ptr addrspace(1) null
- ret void
-}
-
-define void @call_use_intrinsic_workitem_id_x() {
-; GFX9-LABEL: call_use_intrinsic_workitem_id_x:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[16:17]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, use_intrinsic_workitem_id_x@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, use_intrinsic_workitem_id_x@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX9-NEXT: v_writelane_b32 v3, s30, 0
-; GFX9-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: v_readlane_b32 s31, v3, 1
-; GFX9-NEXT: v_readlane_b32 s30, v3, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_mov_b32 s33, s18
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: call_use_intrinsic_workitem_id_x:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s18, s33
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
-; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s16
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_getpc_b64 s[16:17]
-; GFX10-NEXT: s_add_u32 s16, s16, use_intrinsic_workitem_id_x@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s17, s17, use_intrinsic_workitem_id_x@gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v3, s30, 0
-; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: v_writelane_b32 v3, s31, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX10-NEXT: v_readlane_b32 s31, v3, 1
-; GFX10-NEXT: v_readlane_b32 s30, v3, 0
-; GFX10-NEXT: s_mov_b32 s32, s33
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_mov_b32 s33, s18
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- call void @use_intrinsic_workitem_id_x()
- ret void
-}
-
-define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
-; GFX9-LABEL: call_use_intrinsic_workitem_id_x_cc_kernel:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: s_add_u32 s0, s0, s17
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_mov_b32 s13, s15
-; GFX9-NEXT: s_mov_b32 s12, s14
-; GFX9-NEXT: s_getpc_b64 s[14:15]
-; GFX9-NEXT: s_add_u32 s14, s14, use_intrinsic_workitem_id_x@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s15, s15, use_intrinsic_workitem_id_x@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX9-NEXT: s_mov_b32 s14, s16
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: call_use_intrinsic_workitem_id_x_cc_kernel:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_mov_b32 s32, 0
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; GFX10-NEXT: s_add_u32 s0, s0, s17
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_mov_b32 s13, s15
-; GFX10-NEXT: s_mov_b32 s12, s14
-; GFX10-NEXT: s_getpc_b64 s[14:15]
-; GFX10-NEXT: s_add_u32 s14, s14, use_intrinsic_workitem_id_x@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s15, s15, use_intrinsic_workitem_id_x@gotpcrel32@hi+12
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX10-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX10-NEXT: s_mov_b32 s14, s16
-; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX10-NEXT: s_endpgm
- call void @use_intrinsic_workitem_id_x()
- ret void
-}
+; GFX9: attributes #0 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX10: attributes #0 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid2.ll
new file mode 100644
index 0000000000000..d9486c5c78223
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid2.ll
@@ -0,0 +1,313 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+
+;
+; None of these functions should have the attribute amdgpu-no-flat-scratch-init. In these tests
+; we manually set the attribute for the functions. The purpose is to test how llc handles this.
+;
+
+;; tests of addrspacecast
+
+define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
+; GFX9-LABEL: with_private_to_flat_addrspacecast:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: with_private_to_flat_addrspacecast:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
+; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
+; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+ store volatile i32 0, ptr %stof
+ ret void
+}
+
+define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 {
+; GFX9-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s2, -1
+; GFX9-NEXT: s_cselect_b32 s0, s1, 0
+; GFX9-NEXT: s_cselect_b32 s1, s2, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_cmp_lg_u32 s2, -1
+; GFX10-NEXT: s_cselect_b32 s0, s2, 0
+; GFX10-NEXT: s_cselect_b32 s1, s1, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+ %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+ store volatile i32 0, ptr %stof
+ ret void
+}
+
+define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
+; GFX9-LABEL: call_with_private_to_flat_addrspacecast:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s18, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[16:17]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[16:17]
+; GFX9-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: v_writelane_b32 v3, s30, 0
+; GFX9-NEXT: v_writelane_b32 v3, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: v_readlane_b32 s31, v3, 1
+; GFX9-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b32 s33, s18
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: call_with_private_to_flat_addrspacecast:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s18, s33
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
+; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s16
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_getpc_b64 s[16:17]
+; GFX10-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v3, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX10-NEXT: v_writelane_b32 v3, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s31, v3, 1
+; GFX10-NEXT: v_readlane_b32 s30, v3, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_mov_b32 s33, s18
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+ ret void
+}
+
+define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 {
+; GFX9-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 s0, s0, s15
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 s8, s8, 8
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: s_getpc_b64 s[16:17]
+; GFX9-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s15
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s0, s0, s15
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0
+; GFX10-NEXT: s_add_u32 s8, s8, 8
+; GFX10-NEXT: s_addc_u32 s9, s9, 0
+; GFX10-NEXT: s_getpc_b64 s[16:17]
+; GFX10-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT: s_mov_b32 s32, 0
+; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s15
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: s_endpgm
+ call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+ ret void
+}
+
+;; tests of addrspacecast in a constant
+
+define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) #0 {
+; GFX9-LABEL: private_constant_expression_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: private_constant_expression_use:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+ store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8
+ ret void
+}
+
+;; tests of intrinsics
+
+define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 {
+; GFX9-LABEL: calls_intrin_ascast_cc_kernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, 7
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: calls_intrin_ascast_cc_kernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX10-NEXT: v_mov_b32_e32 v2, 7
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+ %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
+ store volatile i32 7, ptr %1, align 4
+ ret void
+}
+
+define void @calls_intrin_ascast(ptr addrspace(3) %ptr) #0 {
+; GFX9-LABEL: calls_intrin_ascast:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, 7
+; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: calls_intrin_ascast:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX10-NEXT: v_mov_b32_e32 v2, 7
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
+ store volatile i32 7, ptr %1, align 4
+ ret void
+}
+
+define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 {
+; GFX9-LABEL: call_calls_intrin_ascast_cc_kernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 s0, s0, s15
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 s8, s8, 8
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: s_getpc_b64 s[16:17]
+; GFX9-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s15
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: call_calls_intrin_ascast_cc_kernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s0, s0, s15
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0
+; GFX10-NEXT: s_add_u32 s8, s8, 8
+; GFX10-NEXT: s_addc_u32 s9, s9, 0
+; GFX10-NEXT: s_getpc_b64 s[16:17]
+; GFX10-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT: s_mov_b32 s32, 0
+; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s15
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: s_endpgm
+ call void @calls_intrin_ascast(ptr addrspace(3) %ptr)
+ ret void
+}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index 6565dd5270b15..c167834470e3b 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -5,6 +5,9 @@
define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
; CHECK-LABEL: _Z11test_kernelPii:
; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-NEXT: s_add_i32 s12, s12, s17
+; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s0, 3
@@ -50,5 +53,3 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
ret void
}
-
-attributes #5 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index a970a0c750c02..fc17d9288bf40 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -3,9 +3,11 @@
;
; This code is used to trigger the following dag node, with different return type and vector element type: i16 extract_vec_elt <N x i8> v, 0
-define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) #0 {
+define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) {
; CHECK-LABEL: eggs:
; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0
; CHECK-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8
; CHECK-NEXT: v_mov_b32_e32 v0, 0
@@ -99,5 +101,3 @@ bb41: ; preds = %bb10, %bb
store <1 x i8> %tmp42, ptr %arg9
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index 73640f2065012..e6f02295e67d5 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -90,10 +90,13 @@ bb:
ret i32 %i9
}
-define amdgpu_kernel void @s_add_co_br_user(i32 %i) #0 {
+define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-LABEL: s_add_co_br_user:
; GFX7: ; %bb.0: ; %bb
; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s0, s2, s2
; GFX7-NEXT: s_cmp_lt_u32 s0, s2
@@ -213,5 +216,3 @@ bb1:
store volatile i32 10, ptr addrspace(1) null
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
index 761ba28ca2557..fac9f5bf826a6 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -5,6 +5,9 @@
define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v1i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -18,6 +21,9 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -32,6 +38,9 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i
define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v2i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -54,6 +63,9 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -80,6 +92,9 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i
define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v3i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -102,6 +117,9 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -128,6 +146,9 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i
define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v4i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -150,6 +171,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -176,6 +200,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i
define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v8i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s0, s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s0, 16
@@ -192,10 +219,13 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
; VI-LABEL: extract_vector_elt_v8i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v3
@@ -213,6 +243,9 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v16i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -235,6 +268,9 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -261,6 +297,9 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x
define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v32i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s0, s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s0, 16
@@ -277,10 +316,13 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
; VI-LABEL: extract_vector_elt_v32i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v3
@@ -298,6 +340,9 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v64i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x10
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -320,6 +365,9 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x40
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -351,6 +399,9 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v2i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s2, s[8:9], 0xa
; SI-NEXT: s_load_dword s3, s[8:9], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -370,11 +421,14 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out
; VI-NEXT: s_load_dword s2, s[8:9], 0x4c
; VI-NEXT: s_load_dword s3, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s2, s2, 3
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_lshr_b32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -388,6 +442,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v3i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s2, s[8:9], 0x13
; SI-NEXT: s_load_dword s3, s[8:9], 0xa
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -406,10 +463,13 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out
; VI-NEXT: s_load_dword s2, s[8:9], 0x4c
; VI-NEXT: s_load_dword s3, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s2, s2, 3
; VI-NEXT: s_lshr_b32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -424,6 +484,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v4i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_load_dword s4, s[8:9], 0xc
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -442,6 +505,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -463,6 +529,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out
define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v8i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_load_dword s4, s[8:9], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -481,6 +550,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -502,6 +574,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_0123:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -526,6 +601,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -558,6 +636,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_0145:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -581,6 +662,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -612,6 +696,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_45:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 4
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -628,6 +715,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 4
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -649,6 +739,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
; SI-LABEL: reduce_load_vector_v16i8_extract_0145:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -672,6 +765,9 @@ define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -700,4 +796,4 @@ define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
ret void
}
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init"}
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 055932a880fc4..2957d0201c223 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -9,11 +9,14 @@
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF))
; unless isFabsFree returns true
-define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
+define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; CI-LABEL: s_fabs_free_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -26,6 +29,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -75,11 +81,14 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
ret void
}
-define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) #0 {
+define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
; CI-LABEL: s_fabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -92,6 +101,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -140,11 +152,14 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) #0 {
ret void
}
-define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 {
+define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; CI-LABEL: s_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -157,6 +172,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -192,10 +210,13 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
ret void
}
-define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) #0 {
+define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; CI-LABEL: s_fabs_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
@@ -209,6 +230,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) #
; VI-LABEL: s_fabs_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
@@ -247,10 +271,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) #
ret void
}
-define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) #0 {
+define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) {
; CI-LABEL: fabs_fold_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -268,6 +295,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s3
@@ -322,11 +352,14 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
ret void
}
-define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 #0 {
+define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CI-LABEL: v_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -341,6 +374,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -388,6 +424,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -400,6 +439,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -443,6 +485,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
@@ -469,6 +514,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -525,9 +573,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: s_lshr_b32 s2, s4, 16
@@ -553,9 +604,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -612,6 +666,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -635,6 +692,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -722,6 +782,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -740,6 +803,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -799,5 +865,6 @@ declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init"}
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index c8a9f56ac6089..60334e46a4454 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -74,6 +74,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-LABEL: global_store_2xi16_align2:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -90,6 +93,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -216,8 +222,10 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1
-; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -227,6 +235,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2
; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5
; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3
@@ -243,6 +252,9 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -351,6 +363,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-LABEL: global_store_2xi16_align4:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -361,6 +376,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -407,7 +425,6 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
ret void
}
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index c1752f2623a3f..9919497acea73 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -24,6 +24,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
; GFX678-LABEL: v_test_canonicalize_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -76,6 +79,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s2, s[8:9], 0x2
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX6-NEXT: v_mov_b32_e32 v0, s0
@@ -87,6 +93,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -132,6 +141,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fabs_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -184,6 +196,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -237,6 +252,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fneg_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -289,6 +307,9 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
; GFX678-LABEL: test_fold_canonicalize_undef_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -328,6 +349,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -367,6 +391,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_bfrev_b32_e32 v2, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -409,6 +436,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 1.0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -449,6 +479,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, -1.0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -489,6 +522,9 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
; GFX678-LABEL: test_fold_canonicalize_literal_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -529,6 +565,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -568,10 +607,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: v_mov_b32_e32 v1, s1
; GFX678-NEXT: flat_store_dword v[0:1], v2
; GFX678-NEXT: s_endpgm
@@ -612,10 +654,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: v_mov_b32_e32 v1, s1
; GFX678-NEXT: flat_store_dword v[0:1], v2
; GFX678-NEXT: s_endpgm
@@ -656,10 +701,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: v_mov_b32_e32 v1, s1
; GFX678-NEXT: flat_store_dword v[0:1], v2
; GFX678-NEXT: s_endpgm
@@ -700,6 +748,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -740,6 +791,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_bfrev_b32_e32 v2, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -782,6 +836,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -822,6 +879,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out
; GFX678-LABEL: test_fold_canonicalize_qnan_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -862,6 +922,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -902,6 +965,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -942,6 +1008,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -982,6 +1051,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1022,6 +1094,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1062,6 +1137,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1102,6 +1180,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1
; GFX678-LABEL: v_test_canonicalize_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1153,6 +1234,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
; GFX6-LABEL: s_test_canonicalize_var_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3]
; GFX6-NEXT: v_mov_b32_e32 v0, s0
@@ -1163,6 +1247,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
; GFX8-LABEL: s_test_canonicalize_var_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -1205,6 +1292,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fabs_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1257,6 +1347,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1
; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1310,6 +1403,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fneg_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1362,10 +1458,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, v0
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, v0
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1407,10 +1506,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1450,10 +1552,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1491,10 +1596,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1532,10 +1640,13 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %
; GFX678-LABEL: test_fold_canonicalize_literal_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1573,10 +1684,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, v0
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, v0
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1618,10 +1732,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, -1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1662,10 +1779,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1705,10 +1825,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, -1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1749,10 +1872,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out
; GFX678-LABEL: test_fold_canonicalize_qnan_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1790,10 +1916,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1831,10 +1960,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1872,10 +2004,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1913,10 +2048,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1954,10 +2092,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1995,10 +2136,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -2037,6 +2181,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2054,6 +2201,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2117,6 +2267,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2134,6 +2287,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2197,6 +2353,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2215,6 +2374,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2279,6 +2441,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2302,6 +2467,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2368,6 +2536,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2385,6 +2556,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2448,6 +2622,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2465,6 +2642,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2529,6 +2709,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2547,6 +2730,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2612,6 +2798,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2635,6 +2824,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2700,6 +2892,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -2717,6 +2912,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -3069,10 +3267,10 @@ define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 {
}
attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-no-flat-scratch-init" }
-attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-no-flat-scratch-init" }
-attributes #3 = { nounwind "denormal-fp-math"="ieee,ieee" "amdgpu-no-flat-scratch-init" }
-attributes #4 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-no-flat-scratch-init" }
-attributes #5 = { nounwind "denormal-fp-math-f32"="dynamic,dynamic" "amdgpu-no-flat-scratch-init" }
-attributes #6 = { nounwind "denormal-fp-math-f32"="dynamic,ieee" "amdgpu-no-flat-scratch-init" }
-attributes #7 = { nounwind "denormal-fp-math-f32"="ieee,dynamic" "amdgpu-no-flat-scratch-init" }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
+attributes #3 = { nounwind "denormal-fp-math"="ieee,ieee" }
+attributes #4 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
+attributes #5 = { nounwind "denormal-fp-math-f32"="dynamic,dynamic" }
+attributes #6 = { nounwind "denormal-fp-math-f32"="dynamic,ieee" }
+attributes #7 = { nounwind "denormal-fp-math-f32"="ieee,dynamic" }
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 10864507ee456..fb2448fb80744 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -16,6 +16,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
; VI-LABEL: multiple_fadd_use_test_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f32_e64 v0, s3, -1.0
; VI-NEXT: v_add_f32_e64 v1, s2, -1.0
@@ -80,8 +83,11 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-NEXT: s_load_dword s3, s[8:9], 0x2c
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_add_u32 s2, s0, 4
; VI-NEXT: v_add_f32_e64 v2, s4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -139,6 +145,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
; VI-LABEL: multiple_use_fadd_fmad_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s4, s0, 4
@@ -194,6 +203,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s6, s4, 4
; VI-NEXT: v_mov_b32_e32 v0, s1
@@ -255,6 +267,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0
; VI-NEXT: v_mul_f32_e32 v2, s2, v0
@@ -303,10 +318,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-NEXT: v_mul_f32_e32 v2, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -350,6 +368,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
; VI-DENORM: ; %bb.0:
; VI-DENORM-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
+; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16
; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0
@@ -368,6 +389,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
; VI-FLUSH: ; %bb.0:
; VI-FLUSH-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
+; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16
; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0
@@ -482,6 +506,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
; VI-DENORM: ; %bb.0:
; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
+; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
@@ -503,6 +530,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
; VI-FLUSH: ; %bb.0:
; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
+; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
@@ -599,6 +629,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
; VI-DENORM: ; %bb.0:
; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
+; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
@@ -620,6 +653,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
; VI-FLUSH: ; %bb.0:
; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
+; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3
@@ -718,6 +754,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-DENORM-NEXT: s_load_dword s6, s[8:9], 0x8
+; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
+; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
@@ -725,6 +763,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s1
; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
+; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-DENORM-NEXT: s_add_u32 s4, s2, 2
; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
; VI-DENORM-NEXT: s_addc_u32 s5, s3, 0
@@ -741,6 +780,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-FLUSH-NEXT: s_load_dword s6, s[8:9], 0x8
+; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
+; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
@@ -748,6 +789,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s1
; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
+; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-FLUSH-NEXT: s_add_u32 s4, s2, 2
; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
; VI-FLUSH-NEXT: s_addc_u32 s5, s3, 0
@@ -847,6 +889,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0
; VI-NEXT: v_mul_f16_e32 v2, s2, v0
@@ -898,10 +943,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 0xc600
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f16_e32 v0, s2, v0
; VI-NEXT: v_mul_f16_e32 v2, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -943,5 +991,5 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
ret void
}
-attributes #0 = { nounwind "unsafe-fp-math"="true" "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind "unsafe-fp-math"="true" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index ec57d1ea3d8d2..eb9eb42df4c78 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -5,10 +5,13 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
-define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) #0 {
+define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) {
; CI-LABEL: fneg_fabs_fadd_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -26,6 +29,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -81,10 +87,13 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
ret void
}
-define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, half %y) #0 {
+define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, half %y) {
; CI-LABEL: fneg_fabs_fmul_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s1, s0, 0x7fff
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -103,6 +112,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -161,11 +173,14 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
; DAGCombiner will transform:
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF))
; unless isFabsFree returns true
-define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
+define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; CI-LABEL: fneg_fabs_free_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_bitset1_b32 s2, 15
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -178,6 +193,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) #0
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s2, 15
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -228,11 +246,14 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) #0
ret void
}
-define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) #0 {
+define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
; CI-LABEL: fneg_fabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_bitset1_b32 s2, 15
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -245,6 +266,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s2, 15
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -294,10 +318,13 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) #0 {
ret void
}
-define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; CIVI-LABEL: v_fneg_fabs_f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -348,10 +375,13 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
ret void
}
-define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <2 x half> %in) #0 {
+define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <2 x half> %in) {
; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
@@ -374,7 +404,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0x4000
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v2, s3
@@ -383,6 +415,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -420,11 +453,14 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
; FIXME: single bit op
; Combine turns this into integer op when bitcast source (from load)
-define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x half> %in) #0 {
+define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x half> %in) {
; CI-LABEL: s_fneg_fabs_v2f16_bc_src:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_or_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -437,6 +473,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_or_b32 s2, s2, 0x80008000
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -473,10 +512,13 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
ret void
}
-define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) #0 {
+define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; CIVI-LABEL: fneg_fabs_v4f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000
; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000
@@ -520,6 +562,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
; CI-LABEL: fold_user_fneg_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s0, 16
; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
@@ -541,7 +586,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0xc400
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v2, s3
@@ -549,6 +596,7 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
; VI-NEXT: v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -583,11 +631,14 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
ret void
}
-define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) #0 {
+define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) {
; CI-LABEL: s_fneg_multi_use_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
@@ -605,6 +656,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
@@ -654,11 +708,14 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
ret void
}
-define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) #0 {
+define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) {
; CI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010
@@ -683,7 +740,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v5, 0xc400
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s1, s4, 16
@@ -692,6 +751,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
; VI-NEXT: v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mul_f16_e64 v5, |s4|, -4.0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_or_b32_e32 v4, v5, v4
; VI-NEXT: v_mov_b32_e32 v5, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -739,5 +799,5 @@ declare half @llvm.fabs.f16(half) #1
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 0b97403968193..058c273a65d99 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -3,7 +3,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
-define i32 @fneg_xor_select_i32(i1 %cond, i32 %arg0, i32 %arg1) #1 {
+define i32 @fneg_xor_select_i32(i1 %cond, i32 %arg0, i32 %arg1) {
; GCN-LABEL: fneg_xor_select_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -25,7 +25,7 @@ define i32 @fneg_xor_select_i32(i1 %cond, i32 %arg0, i32 %arg1) #1 {
ret i32 %fneg
}
-define <2 x i32> @fneg_xor_select_v2i32(<2 x i1> %cond, <2 x i32> %arg0, <2 x i32> %arg1) #1 {
+define <2 x i32> @fneg_xor_select_v2i32(<2 x i1> %cond, <2 x i32> %arg0, <2 x i32> %arg1) {
; GCN-LABEL: fneg_xor_select_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -53,7 +53,7 @@ define <2 x i32> @fneg_xor_select_v2i32(<2 x i1> %cond, <2 x i32> %arg0, <2 x i3
ret <2 x i32> %fneg
}
-define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr addrspace(1) %ptr) #1 {
+define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr addrspace(1) %ptr) {
; GFX7-LABEL: fneg_xor_select_i32_multi_use:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -92,7 +92,7 @@ define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr ad
ret i32 %fneg
}
-define i64 @fneg_xor_select_i64(i1 %cond, i64 %arg0, i64 %arg1) #1 {
+define i64 @fneg_xor_select_i64(i1 %cond, i64 %arg0, i64 %arg1) {
; GCN-LABEL: fneg_xor_select_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -116,7 +116,7 @@ define i64 @fneg_xor_select_i64(i1 %cond, i64 %arg0, i64 %arg1) #1 {
ret i64 %fneg
}
-define <2 x i64> @fneg_xor_select_v2i64(<2 x i1> %cond, <2 x i64> %arg0, <2 x i64> %arg1) #1 {
+define <2 x i64> @fneg_xor_select_v2i64(<2 x i1> %cond, <2 x i64> %arg0, <2 x i64> %arg1) {
; GCN-LABEL: fneg_xor_select_v2i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -148,7 +148,7 @@ define <2 x i64> @fneg_xor_select_v2i64(<2 x i1> %cond, <2 x i64> %arg0, <2 x i6
ret <2 x i64> %fneg
}
-define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) #1 {
+define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) {
; GCN-LABEL: fneg_xor_select_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -172,7 +172,7 @@ define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) #1 {
ret i16 %fneg
}
-define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i16> %arg1) #1 {
+define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i16> %arg1) {
; GFX7-LABEL: fneg_xor_select_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -227,7 +227,7 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1
ret <2 x i16> %fneg
}
-define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr addrspace(1) %ptr) #1 {
+define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr addrspace(1) %ptr) {
; GFX7-LABEL: fneg_xor_select_i16_multi_use:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -266,7 +266,7 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
ret i16 %fneg
}
-define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr addrspace(1) %ptr) #1 {
+define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr addrspace(1) %ptr) {
; GFX7-LABEL: fneg_xor_select_i64_multi_user:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -310,7 +310,7 @@ define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr a
ret i64 %fneg
}
-define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg1) #1 {
+define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg1) {
; GCN-LABEL: select_fneg_xor_select_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -346,7 +346,7 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg
ret i32 %select1
}
-define float @select_fneg_select_f32(i1 %cond0, i1 %cond1, float %arg0, float %arg1) #1 {
+define float @select_fneg_select_f32(i1 %cond0, i1 %cond1, float %arg0, float %arg1) {
; GCN-LABEL: select_fneg_select_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -377,7 +377,7 @@ define float @select_fneg_select_f32(i1 %cond0, i1 %cond1, float %arg0, float %a
ret float %select1
}
-define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) #1 {
+define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
; GCN-LABEL: fneg_xor_select_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -401,7 +401,7 @@ define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) #1 {
ret double %fneg
}
-define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %arg1, ptr addrspace(1) %ptr) #1 {
+define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %arg1, ptr addrspace(1) %ptr) {
; GFX7-LABEL: fneg_xor_select_f64_multi_user:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -446,7 +446,7 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
ret double %fneg
}
-define double @fneg_xor_select_i64_user_with_srcmods(i1 %cond, i64 %arg0, i64 %arg1) #1 {
+define double @fneg_xor_select_i64_user_with_srcmods(i1 %cond, i64 %arg0, i64 %arg1) {
; GCN-LABEL: fneg_xor_select_i64_user_with_srcmods:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -473,7 +473,7 @@ define double @fneg_xor_select_i64_user_with_srcmods(i1 %cond, i64 %arg0, i64 %a
ret double %add
}
-define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, double %arg1) #1 {
+define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, double %arg1) {
; GCN-LABEL: select_fneg_select_fneg_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -511,7 +511,7 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
ret double %select1
}
-define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg1) #1 {
+define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg1) {
; GCN-LABEL: select_fneg_xor_select_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -549,7 +549,7 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg
ret i64 %select1
}
-define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1) #1 {
+define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1) {
; GFX7-LABEL: select_fneg_select_f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -600,7 +600,7 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1
ret half %select1
}
-define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg1) #1 {
+define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg1) {
; GCN-LABEL: select_fneg_xor_select_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -636,7 +636,7 @@ define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg
ret i16 %select1
}
-define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 x half> %arg0, <2 x half> %arg1) #1 {
+define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 x half> %arg0, <2 x half> %arg1) {
; GFX7-LABEL: select_fneg_select_v2f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -733,7 +733,7 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2
ret <2 x half> %select1
}
-define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1, <2 x i16> %arg0, <2 x i16> %arg1) #1 {
+define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1, <2 x i16> %arg0, <2 x i16> %arg1) {
; GFX7-LABEL: select_fneg_xor_select_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -820,7 +820,7 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1,
; pattern that appeared in rocm-device-libs to manually operate on the
; sign bit of the high half of a double
-define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) #1 {
+define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) {
; GCN-LABEL: cospiD_pattern0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -861,7 +861,7 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) #1 {
ret double %i11
}
-define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) #1 {
+define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
; GCN-LABEL: cospiD_pattern1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -899,7 +899,7 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) #1 {
}
; artifical example, scaled to operation on 16-bit halves of a float.
-define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) #1 {
+define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
; GFX7-LABEL: cospiD_pattern0_half:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -958,7 +958,7 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) #1 {
ret float %i11
}
-define float @cospiD_pattern1_half(i16 %arg, float %arg1, float %arg2) #1 {
+define float @cospiD_pattern1_half(i16 %arg, float %arg1, float %arg2) {
; GFX7-LABEL: cospiD_pattern1_half:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -999,7 +999,7 @@ define float @cospiD_pattern1_half(i16 %arg, float %arg1, float %arg2) #1 {
ret float %i7
}
-define double @fneg_f64_bitcast_vector_i64_to_f64(i64 %arg) #1 {
+define double @fneg_f64_bitcast_vector_i64_to_f64(i64 %arg) {
; GCN-LABEL: fneg_f64_bitcast_vector_i64_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1016,7 +1016,7 @@ define double @fneg_f64_bitcast_vector_i64_to_f64(i64 %arg) #1 {
ret double %fneg
}
-define double @fneg_f64_bitcast_vector_v2i32_to_f64(<2 x i32> %arg) #1 {
+define double @fneg_f64_bitcast_vector_v2i32_to_f64(<2 x i32> %arg) {
; GCN-LABEL: fneg_f64_bitcast_vector_v2i32_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1033,7 +1033,7 @@ define double @fneg_f64_bitcast_vector_v2i32_to_f64(<2 x i32> %arg) #1 {
ret double %fneg
}
-define double @fneg_f64_bitcast_vector_v2f32_to_f64(<2 x float> %arg) #1 {
+define double @fneg_f64_bitcast_vector_v2f32_to_f64(<2 x float> %arg) {
; GCN-LABEL: fneg_f64_bitcast_vector_v2f32_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1050,7 +1050,7 @@ define double @fneg_f64_bitcast_vector_v2f32_to_f64(<2 x float> %arg) #1 {
ret double %fneg
}
-define double @fneg_f64_bitcast_vector_v4i16_to_f64(<4 x i16> %arg) #1 {
+define double @fneg_f64_bitcast_vector_v4i16_to_f64(<4 x i16> %arg) {
; GFX7-LABEL: fneg_f64_bitcast_vector_v4i16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1079,7 +1079,7 @@ define double @fneg_f64_bitcast_vector_v4i16_to_f64(<4 x i16> %arg) #1 {
ret double %fneg
}
-define double @fneg_f64_bitcast_vector_v4f16_to_f64(<4 x half> %arg) #1 {
+define double @fneg_f64_bitcast_vector_v4f16_to_f64(<4 x half> %arg) {
; GFX7-LABEL: fneg_f64_bitcast_vector_v4f16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1110,7 +1110,7 @@ define double @fneg_f64_bitcast_vector_v4f16_to_f64(<4 x half> %arg) #1 {
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v2i32_to_f64(i32 %elt0, i32 %elt1) #1 {
+define double @fneg_f64_bitcast_build_vector_v2i32_to_f64(i32 %elt0, i32 %elt1) {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1129,7 +1129,7 @@ define double @fneg_f64_bitcast_build_vector_v2i32_to_f64(i32 %elt0, i32 %elt1)
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v2f32_to_f64(float %elt0, float %elt1) #1 {
+define double @fneg_f64_bitcast_build_vector_v2f32_to_f64(float %elt0, float %elt1) {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1148,7 +1148,7 @@ define double @fneg_f64_bitcast_build_vector_v2f32_to_f64(float %elt0, float %el
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v4i16_to_f64(i16 %elt0, i16 %elt1, i16 %elt2, i16 %elt3) #1 {
+define double @fneg_f64_bitcast_build_vector_v4i16_to_f64(i16 %elt0, i16 %elt1, i16 %elt2, i16 %elt3) {
; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4i16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1187,7 +1187,7 @@ define double @fneg_f64_bitcast_build_vector_v4i16_to_f64(i16 %elt0, i16 %elt1,
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1, half %elt2, half %elt3) #1 {
+define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1, half %elt2, half %elt3) {
; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4f16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1228,7 +1228,7 @@ define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat %elt1, bfloat %elt2, bfloat %elt3) #1 {
+define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat %elt1, bfloat %elt2, bfloat %elt3) {
; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1269,7 +1269,7 @@ define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user(i32 %elt0, i32 %elt1, double %fp.val) #1 {
+define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user(i32 %elt0, i32 %elt1, double %fp.val) {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1289,7 +1289,7 @@ define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user(i32 %elt
ret double %fmul
}
-define { double, double } @fneg_f64_bitcast_build_vector_v2i32_to_f64_multi_modifier_user(i32 %elt0, i32 %elt1, double %fp.val0, double %fp.val1) #1 {
+define { double, double } @fneg_f64_bitcast_build_vector_v2i32_to_f64_multi_modifier_user(i32 %elt0, i32 %elt1, double %fp.val0, double %fp.val1) {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64_multi_modifier_user:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1318,7 +1318,7 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2i32_to_f64_multi_modi
ret { double, double } %ret.1
}
-define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user_integer_neg_source(i32 %elt0, i32 %elt1, double %fp.val) #1 {
+define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user_integer_neg_source(i32 %elt0, i32 %elt1, double %fp.val) {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user_integer_neg_source:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1342,7 +1342,7 @@ define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user_integer_
ret double %fmul
}
-define double @fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64(float %elt0, float %elt1) #1 {
+define double @fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64(float %elt0, float %elt1) {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1362,7 +1362,7 @@ define double @fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64(float
ret double %fneg
}
-define double @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_user(float %elt0, float %elt1, ptr addrspace(1) %ptr) #1 {
+define double @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_user(float %elt0, float %elt1, ptr addrspace(1) %ptr) {
; GFX7-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_user:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1394,7 +1394,7 @@ define double @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_user(fl
ret double %fneg
}
-define { double, <2 x float> } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_foldable_user(float %elt0, float %elt1, <2 x float> %arg.v2f32) #1 {
+define { double, <2 x float> } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_foldable_user(float %elt0, float %elt1, <2 x float> %arg.v2f32) {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_foldable_user:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1421,7 +1421,7 @@ define { double, <2 x float> } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitca
ret { double, <2 x float> } %ret.1
}
-define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_user(float %elt0, float %elt1) #1 {
+define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_user(float %elt0, float %elt1) {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_user:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1446,7 +1446,7 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_us
ret { double, double } %ret.1
}
-define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_foldable_user(float %elt0, float %elt1, double %arg.f64) #1 {
+define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_foldable_user(float %elt0, float %elt1, double %arg.f64) {
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_foldable_user:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1471,12 +1471,14 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_fo
}
; Check for correct bitcasting back when there are multiple uses
-define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i1 %z, ptr addrspace(1) %dst) #1 {
+define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i1 %z, ptr addrspace(1) %dst) {
; GFX7-LABEL: multiple_uses_fneg_select_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x4
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x6
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bitcmp1_b32 s6, 0
; GFX7-NEXT: s_cselect_b64 vcc, -1, 0
@@ -1488,6 +1490,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
; GFX7-NEXT: s_cselect_b32 s0, s0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s5
@@ -1543,7 +1546,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
ret void
}
-define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) #1 {
+define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) {
; GCN-LABEL: fnge_select_f32_multi_use_regression:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
@@ -1598,5 +1601,3 @@ bb5: ; preds = %bb, %.entry
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #0
attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
-
-attributes #1 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 92a349a4db19f..98e0b27cd955d 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -11,6 +11,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x8000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -23,6 +26,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -78,6 +84,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -92,6 +101,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -152,6 +164,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x8000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -164,6 +179,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -217,6 +235,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
; CI-LABEL: v_fneg_fold_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -234,6 +255,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: v_fneg_fold_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -289,6 +313,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -301,6 +328,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -340,14 +370,17 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 {
; CIVI-LABEL: s_fneg_v2f16_nonload:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
; CIVI-NEXT: ;;#ASMSTART
; CIVI-NEXT: ; def s2
; CIVI-NEXT: ;;#ASMEND
; CIVI-NEXT: s_xor_b32 s2, s2, 0x80008000
-; CIVI-NEXT: v_mov_b32_e32 v2, s2
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: v_mov_b32_e32 v1, s1
+; CIVI-NEXT: v_mov_b32_e32 v2, s2
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
;
@@ -388,6 +421,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -402,6 +438,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -449,6 +488,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -461,6 +503,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -501,6 +546,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; CI-LABEL: v_fneg_fold_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -527,6 +575,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; GFX8-LABEL: v_fneg_fold_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -572,6 +623,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
; CI-LABEL: v_extract_fneg_fold_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -593,6 +647,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
; GFX8-LABEL: v_extract_fneg_fold_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -672,6 +729,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0
; CIVI-LABEL: v_extract_fneg_no_fold_v2f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
@@ -723,5 +783,5 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 5915f49658e55..10573aad38a51 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -10,6 +10,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -21,6 +24,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -46,6 +52,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -57,6 +66,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -81,6 +93,9 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
; CIVI-LABEL: load_v3f16_arg:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_add_u32 s4, s0, 4
; CIVI-NEXT: s_addc_u32 s5, s1, 0
@@ -114,6 +129,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
; CIVI-LABEL: load_v4f16_arg:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v2, s2
@@ -139,6 +157,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -153,6 +174,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -183,6 +207,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s3, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -196,6 +223,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -227,6 +257,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -238,6 +271,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -265,6 +301,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s3, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -278,6 +317,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -308,6 +350,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
; CI-LABEL: extload_v3f16_to_v3f32_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
@@ -321,6 +366,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
; VI-LABEL: extload_v3f16_to_v3f32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
@@ -351,6 +399,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; CI-LABEL: extload_v4f16_to_v4f32_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s3, 16
; CI-NEXT: s_lshr_b32 s5, s2, 16
@@ -366,6 +417,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; VI-LABEL: extload_v4f16_to_v4f32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: s_lshr_b32 s5, s2, 16
@@ -401,6 +455,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s6, s1, 16
; CI-NEXT: s_lshr_b32 s7, s0, 16
@@ -429,6 +486,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s1, 16
; VI-NEXT: s_lshr_b32 s7, s0, 16
@@ -485,6 +545,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
; CI-LABEL: extload_f16_to_f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -498,6 +561,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
; VI-LABEL: extload_f16_to_f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -529,6 +595,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
; CI-LABEL: extload_v2f16_to_v2f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s1
@@ -545,6 +614,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
; VI-LABEL: extload_v2f16_to_v2f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s1
@@ -582,6 +654,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
; CI-LABEL: extload_v3f16_to_v3f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
; CI-NEXT: s_lshr_b32 s4, s2, 16
@@ -603,6 +678,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
; VI-LABEL: extload_v3f16_to_v3f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
; VI-NEXT: s_lshr_b32 s4, s2, 16
@@ -648,6 +726,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
; CI-LABEL: extload_v4f16_to_v4f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
@@ -673,6 +754,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
; VI-LABEL: extload_v4f16_to_v4f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s3, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s3
@@ -726,6 +810,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s6, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s6
@@ -773,6 +860,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s0, 16
; VI-NEXT: s_lshr_b32 s8, s2, 16
@@ -858,6 +948,9 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr
; CIVI-LABEL: global_load_store_f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -886,6 +979,9 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: global_load_store_v2f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -914,6 +1010,9 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add
; CIVI-LABEL: global_load_store_v4f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
@@ -942,6 +1041,9 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: global_load_store_v8f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -970,6 +1072,9 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr
; CIVI-LABEL: global_extload_f16_to_f32:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -1001,6 +1106,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v2f16_to_v2f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1017,6 +1125,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v2f16_to_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1052,6 +1163,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v3f16_to_v3f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1069,6 +1183,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v3f16_to_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1106,6 +1223,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v4f16_to_v4f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1125,6 +1245,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v4f16_to_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1165,6 +1288,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v8f16_to_v8f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1195,6 +1321,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v8f16_to_v8f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1251,6 +1380,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; CI-LABEL: global_extload_v16f16_to_v16f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s4, s2, 16
; CI-NEXT: v_mov_b32_e32 v5, s3
@@ -1309,6 +1441,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; VI-LABEL: global_extload_v16f16_to_v16f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1406,6 +1541,9 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr
; CIVI-LABEL: global_extload_f16_to_f64:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -1440,6 +1578,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v2f16_to_v2f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1458,6 +1599,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v2f16_to_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1498,6 +1642,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v3f16_to_v3f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1523,6 +1670,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v3f16_to_v3f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1574,6 +1724,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v4f16_to_v4f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1602,6 +1755,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v4f16_to_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1659,6 +1815,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v8f16_to_v8f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1707,6 +1866,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v8f16_to_v8f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1791,6 +1953,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; CI-LABEL: global_extload_v16f16_to_v16f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1885,6 +2050,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; VI-LABEL: global_extload_v16f16_to_v16f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2039,6 +2207,9 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p
; CIVI-LABEL: global_truncstore_f32_to_f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -2070,6 +2241,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v2f32_to_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2087,6 +2261,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v2f32_to_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2123,6 +2300,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v3f32_to_v3f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2146,6 +2326,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v3f32_to_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2191,6 +2374,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v4f32_to_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2212,6 +2398,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v4f32_to_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2254,6 +2443,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v8f32_to_v8f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2289,6 +2481,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v8f32_to_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2352,6 +2547,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; CI-LABEL: global_truncstore_v16f32_to_v16f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s4, s2, 32
; CI-NEXT: s_addc_u32 s5, s3, 0
@@ -2420,6 +2618,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; VI-LABEL: global_truncstore_v16f32_to_v16f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 32
; VI-NEXT: s_addc_u32 s5, s3, 0
@@ -2530,6 +2731,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
; CI-LABEL: fadd_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -2547,6 +2751,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s3
@@ -2577,6 +2784,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
; CI-LABEL: fadd_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
@@ -2598,6 +2808,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
; VI-LABEL: fadd_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: s_lshr_b32 s5, s2, 16
@@ -2629,6 +2842,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-LABEL: fadd_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2666,6 +2882,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: fadd_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2706,6 +2925,9 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s10, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v4, s0
@@ -2764,6 +2986,9 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s10, s7, 16
; VI-NEXT: s_lshr_b32 s11, s3, 16
@@ -2824,6 +3049,9 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr
; CIVI-LABEL: test_bitcast_from_half:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
@@ -2853,6 +3081,9 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
; CIVI-LABEL: test_bitcast_to_half:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -2878,4 +3109,4 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
ret void
}
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 60469b25dc28c..024593c49dba1 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -43,7 +43,7 @@
; ELF: 00E0: 6E616D65 A673696D 706C65BB 2E707269
; ELF: 00F0: 76617465 5F736567 6D656E74 5F666978
; ELF: 0100: 65645F73 697A6500 AB2E7367 70725F63
-; ELF: 0110: 6F756E74 06B12E73 6770725F 7370696C
+; ELF: 0110: 6F756E74 0EB12E73 6770725F 7370696C
; ELF: 0120: 6C5F636F 756E7400 A72E7379 6D626F6C
; ELF: 0130: A973696D 706C652E 6B64AB2E 76677072
; ELF: 0140: 5F636F75 6E7403B1 2E766770 725F7370
@@ -59,7 +59,7 @@
; ELF: 01E0: 73696D70 6C655F6E 6F5F6B65 726E6172
; ELF: 01F0: 6773BB2E 70726976 6174655F 7365676D
; ELF: 0200: 656E745F 66697865 645F7369 7A6500AB
-; ELF: 0210: 2E736770 725F636F 756E7400 B12E7367
+; ELF: 0210: 2E736770 725F636F 756E740C B12E7367
; ELF: 0220: 70725F73 70696C6C 5F636F75 6E7400A7
; ELF: 0230: 2E73796D 626F6CB5 73696D70 6C655F6E
; ELF: 0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
@@ -120,7 +120,7 @@ entry:
ret void
}
-attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index d932b70a67c9f..b51cb9df8d784 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
; VI-LABEL: s_insertelement_v2bf16_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -82,6 +85,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
; VI-LABEL: s_insertelement_v2bf16_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -144,6 +150,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -216,6 +225,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -286,6 +298,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -358,6 +373,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -435,11 +453,14 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1)
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -531,14 +552,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, s4, v0, v4
@@ -611,14 +635,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, v0, s4, v4
@@ -689,14 +716,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
@@ -769,14 +799,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, v1, s4, v4
@@ -853,9 +886,12 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1)
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -922,7 +958,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
; SI-LABEL: v_insertelement_v8bf16_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -948,9 +984,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
@@ -1001,7 +1040,7 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
ret void
}
-define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) #0 {
+define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
; SI-LABEL: v_insertelement_v8bf16_dynamic:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -1065,9 +1104,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -1216,7 +1258,7 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
ret void
}
-define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
; SI-LABEL: v_insertelement_v16bf16_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -1245,11 +1287,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -1311,7 +1356,7 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
ret void
}
-define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) #0 {
+define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
; SI-LABEL: v_insertelement_v16bf16_dynamic:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0
@@ -1417,11 +1462,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -1683,5 +1731,5 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 8d84c232ec7f8..2cecbe376520d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -21,6 +21,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2i16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -68,6 +71,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -84,6 +90,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -152,6 +161,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -172,6 +184,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -253,6 +268,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -268,6 +286,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -322,6 +343,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -341,6 +365,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -428,6 +455,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -450,6 +480,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -544,6 +577,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2i16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -590,6 +626,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -606,6 +645,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -669,6 +711,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2f16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -714,6 +759,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2f16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -760,6 +808,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -778,6 +829,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -834,9 +888,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -853,9 +910,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -926,6 +986,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -944,6 +1007,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -999,6 +1065,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1017,6 +1086,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1084,6 +1156,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1102,6 +1177,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1169,6 +1247,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1187,6 +1268,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1241,6 +1325,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1259,6 +1346,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1313,6 +1403,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1331,6 +1424,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1399,6 +1495,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1417,6 +1516,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1491,6 +1593,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s4, s[4:5], 0x0
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1510,6 +1615,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1572,9 +1680,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -1593,9 +1704,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -1658,11 +1772,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1685,11 +1802,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_load_dword v4, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1758,14 +1878,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, s4, v0, v4
@@ -1777,9 +1900,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -1851,14 +1977,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, v0, s4, v4
@@ -1870,9 +1999,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -1944,14 +2076,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
@@ -1963,9 +2098,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2037,14 +2175,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, v1, s4, v4
@@ -2056,9 +2197,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2130,14 +2274,17 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
@@ -2149,9 +2296,12 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2229,6 +2379,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
@@ -2256,6 +2409,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: flat_load_dword v4, v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
@@ -2359,9 +2515,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -2385,9 +2544,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2435,7 +2597,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
ret void
}
-define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
; GFX9-LABEL: v_insertelement_v8f16_3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -2454,9 +2616,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
@@ -2474,9 +2639,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
@@ -2528,7 +2696,7 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
ret void
}
-define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
; GFX9-LABEL: v_insertelement_v8i16_6:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -2548,9 +2716,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -2568,9 +2739,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
@@ -2623,7 +2797,7 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
ret void
}
-define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) #0 {
+define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
; GFX9-LABEL: v_insertelement_v8f16_dynamic:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -2674,9 +2848,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -2728,9 +2905,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
@@ -2892,7 +3072,7 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
ret void
}
-define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
; GFX9-LABEL: v_insertelement_v16f16_3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -2914,11 +3094,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -2941,9 +3124,12 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s3
; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v4
@@ -3011,7 +3197,7 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
ret void
}
-define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
; GFX9-LABEL: v_insertelement_v16i16_6:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -3034,12 +3220,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT: v_mov_b32_e32 v12, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -3047,6 +3235,7 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
+; VI-NEXT: v_mov_b32_e32 v12, 0x3020504
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_perm_b32 v3, s4, v3, v12
@@ -3060,11 +3249,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -3133,7 +3325,7 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
ret void
}
-define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) #0 {
+define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
; GFX9-LABEL: v_insertelement_v16f16_dynamic:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -3219,11 +3411,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -3316,11 +3511,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3]
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
@@ -3612,5 +3810,5 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
index 2f7bae7d94a23..5dff7372ab561 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -4,9 +4,12 @@
; Check illegal casts are codegened as poison, and not an error.
-define amdgpu_kernel void @use_group_to_global_addrspacecast(ptr addrspace(3) %ptr) #0 {
+define amdgpu_kernel void @use_group_to_global_addrspacecast(ptr addrspace(3) %ptr) {
; CHECK-LABEL: use_group_to_global_addrspacecast:
; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-NEXT: s_add_i32 s12, s12, s17
+; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: flat_store_dword v[0:1], v0
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -65,5 +68,3 @@ define amdgpu_kernel void @use_42_to_local_addrspacecast(ptr addrspace(42) %ptr)
%load = load volatile i32, ptr addrspace(3) %cast
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
index e71345d0fda7e..55a5d50f06bbd 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
@@ -1,11 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
-define amdgpu_kernel void @load_idx_idy(ptr addrspace(4) %disp, ptr %g) #1 {
+define amdgpu_kernel void @load_idx_idy(ptr addrspace(4) %disp, ptr %g) {
; CHECK-LABEL: load_idx_idy:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dword s6, s[4:5], 0x4
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
@@ -35,4 +37,3 @@ entry:
declare noundef nonnull align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-attributes #1 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index 1c1e7067d6730..4edd0357c6e7a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -7,7 +7,7 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) #0 {
+define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; SI-LABEL: is_private_vgpr:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -30,9 +30,12 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) #0 {
; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x32
; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -59,10 +62,13 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) #0 {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x32
; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -110,7 +116,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) #0 {
; FIXME: setcc (zero_extend (setcc)), 1) not folded out, resulting in
; select and vcc branch.
-define amdgpu_kernel void @is_private_sgpr(ptr %ptr) #0 {
+define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; SI-LABEL: is_private_sgpr:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[8:9], 0x1
@@ -133,6 +139,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) #0 {
; CI-SDAG: ; %bb.0:
; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1
; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x32
+; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1
; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -166,6 +175,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) #0 {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x32
+; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0
; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
@@ -228,8 +240,6 @@ bb1:
ret void
}
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
-
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index 5f1a03e575fb5..9d078f7906b4d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -7,7 +7,7 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) #0 {
+define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; CIT-LABEL: is_local_vgpr:
; CIT: ; %bb.0:
; CIT-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -63,9 +63,12 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) #0 {
; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x33
; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -92,10 +95,13 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) #0 {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x33
; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -143,7 +149,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) #0 {
; FIXME: setcc (zero_extend (setcc)), 1) not folded out, resulting in
; select and vcc branch.
-define amdgpu_kernel void @is_local_sgpr(ptr %ptr) #0 {
+define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; CIT-LABEL: is_local_sgpr:
; CIT: ; %bb.0:
; CIT-NEXT: s_load_dword s0, s[6:7], 0x1
@@ -200,6 +206,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) #0 {
; CI-SDAG: ; %bb.0:
; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1
; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x33
+; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1
; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -233,6 +242,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) #0 {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x33
+; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0
; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
@@ -295,8 +307,6 @@ bb1:
ret void
}
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
-
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 839b8df7d249e..0fe371c1b51fe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -20,11 +20,14 @@ define void @function_lds_id(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) #0 !llvm.amdgcn.lds.kernel.id !0 {
+define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
; GCN-LABEL: kernel_lds_id:
; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s2, s12, 42
+; GCN-NEXT: s_add_i32 s2, s14, 42
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
@@ -38,36 +41,45 @@ define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) #0 !llvm.amdgcn.
ret void
}
-define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) #0 !llvm.amdgcn.lds.kernel.id !1 {
+define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !1 {
; GCN-LABEL: indirect_lds_id:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: s_add_u32 s0, s0, s15
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_load_dwordx2 s[16:17], s[8:9], 0x0
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_load_dwordx2 s[18:19], s[8:9], 0x0
; GCN-NEXT: s_add_u32 s8, s8, 8
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: s_addc_u32 s9, s9, 0
-; GCN-NEXT: s_getpc_b64 s[18:19]
-; GCN-NEXT: s_add_u32 s18, s18, function_lds_id@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s19, s19, function_lds_id@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0
+; GCN-NEXT: s_getpc_b64 s[14:15]
+; GCN-NEXT: s_add_u32 s14, s14, function_lds_id@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s15, s15, function_lds_id@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[20:21], s[14:15], 0x0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: v_or_b32_e32 v31, v0, v2
; GCN-NEXT: s_mov_b32 s15, 21
+; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GCN-NEXT: v_mov_b32_e32 v0, s18
+; GCN-NEXT: v_mov_b32_e32 v1, s19
+; GCN-NEXT: s_swappc_b64 s[30:31], s[20:21]
; GCN-NEXT: s_endpgm
call void @function_lds_id(ptr addrspace(1) %out)
ret void
}
-define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) #0 !llvm.amdgcn.lds.kernel.id !0 {
+define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
; GCN-LABEL: doesnt_use_it:
; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v2, 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -79,7 +91,6 @@ define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) #0 !llvm.amdgcn.
ret void
}
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
!0 = !{i32 42}
!1 = !{i32 21}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 5ce8d07380e7f..cc9e34be209b4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -280,10 +280,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #0 {
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -294,6 +297,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -305,14 +311,17 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #0 {
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -321,10 +330,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -333,14 +345,17 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #0 {
+define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -349,11 +364,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -362,16 +380,19 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #0 {
+define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_m0:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 m0, -1
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
@@ -379,12 +400,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #0 {
; CHECK-GISEL-LABEL: test_readfirstlane_m0:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 m0, -1
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -394,29 +418,35 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #0 {
ret void
}
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #0 {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 s2, 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -426,17 +456,20 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1
ret void
}
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #0 {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -444,13 +477,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -460,17 +496,20 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
ret void
}
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #0 {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -478,13 +517,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -1601,4 +1643,3 @@ define void @test_readfirstlane_v32f16(ptr addrspace(1) %out, <32 x half> %src)
ret void
}
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index fb7b2775de608..f2b0959cc706e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -179,6 +179,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -189,6 +192,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -204,10 +210,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -216,10 +225,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -232,10 +244,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -244,11 +259,14 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -262,6 +280,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -281,6 +302,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -311,6 +335,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -332,6 +359,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -365,6 +395,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -386,6 +419,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -419,12 +455,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
; CHECK-SDAG-LABEL: test_readlane_m0_sreg:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 m0, -1
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
@@ -432,12 +471,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
; CHECK-GISEL-LABEL: test_readlane_m0_sreg:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 m0, -1
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -454,11 +496,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v0
; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
@@ -468,10 +513,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; def v0
; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -485,14 +533,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -505,10 +556,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: ; def v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -523,14 +577,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -543,10 +600,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: ; def v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -561,25 +621,31 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou
; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 s2, 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -593,13 +659,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -607,13 +676,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -627,13 +699,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -641,13 +716,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -897,5 +975,5 @@ define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
-attributes #1 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #1 = { nounwind }
attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 3e92d73918cdd..4ac2cc98970b5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -15,6 +15,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
; GFX802-SDAG-LABEL: test_writelane_sreg_i32:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
@@ -53,6 +56,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
@@ -98,6 +104,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
@@ -147,6 +156,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
@@ -202,6 +214,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
@@ -251,6 +266,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
@@ -306,6 +324,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -348,6 +369,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -396,6 +420,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
@@ -444,6 +471,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -498,11 +528,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
@@ -551,11 +584,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -609,6 +645,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -668,6 +707,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -738,6 +780,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -803,6 +848,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -877,7 +925,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -886,6 +936,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1]
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
@@ -946,7 +997,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
-; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -956,6 +1009,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000
; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0
; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1028,15 +1082,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
; GFX802-SDAG-NEXT: ;;#ASMSTART
; GFX802-SDAG-NEXT: s_mov_b32 m0, -1
; GFX802-SDAG-NEXT: ;;#ASMEND
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: s_mov_b32 s4, m0
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
; GFX802-SDAG-NEXT: s_endpgm
;
@@ -1081,15 +1138,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
; GFX802-GISEL-NEXT: ;;#ASMSTART
; GFX802-GISEL-NEXT: s_mov_b32 m0, -1
; GFX802-GISEL-NEXT: ;;#ASMEND
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: s_mov_b32 s4, m0
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX802-GISEL-NEXT: s_endpgm
;
@@ -1138,6 +1198,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -1180,6 +1243,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1227,6 +1293,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
; GFX802-SDAG-LABEL: test_writelane_imm_i64:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
@@ -1270,6 +1339,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
; GFX802-GISEL-LABEL: test_writelane_imm_i64:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -1319,6 +1391,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
; GFX802-SDAG-LABEL: test_writelane_imm_f64:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
@@ -1362,6 +1437,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
; GFX802-GISEL-LABEL: test_writelane_imm_f64:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -1412,6 +1490,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
@@ -1449,6 +1530,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
@@ -1492,10 +1576,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0
@@ -1538,11 +1625,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -1589,10 +1679,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0
@@ -1635,11 +1728,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -1684,7 +1780,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -1716,7 +1815,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1754,11 +1856,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
@@ -1797,11 +1902,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -1845,11 +1953,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
@@ -1888,11 +1999,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -2690,5 +2804,5 @@ define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %sr
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
-attributes #1 = { nounwind "amdgpu-no-flat-scratch-init"}
+attributes #1 = { nounwind }
attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 6f638e33488bd..919c1dfd4694e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
; GFX7-HSA-LABEL: constant_load_f64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -60,10 +63,10 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
ret void
}
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
; Tests whether a load-chain of 8 constants of 64bit each gets vectorized into a wider load.
-define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) #0 {
+define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) {
; GFX6-NOHSA-LABEL: constant_load_2v4f64:
; GFX6-NOHSA: ; %bb.0: ; %entry
; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9
@@ -90,7 +93,10 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu
;
; GFX7-HSA-LABEL: constant_load_2v4f64:
; GFX7-HSA: ; %bb.0: ; %entry
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 469e2e0975b73..a185157a553cf 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -6,7 +6,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
-define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
; GCN-NOHSA-SI-LABEL: constant_load_i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -27,6 +27,9 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: constant_load_i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -101,7 +104,7 @@ entry:
ret void
}
-define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
; GCN-NOHSA-SI-LABEL: constant_load_v2i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -117,6 +120,9 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v2i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -169,7 +175,7 @@ entry:
ret void
}
-define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
; GCN-NOHSA-SI-LABEL: constant_load_v3i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -188,6 +194,9 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v3i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: s_add_u32 s4, s0, 4
@@ -269,7 +278,7 @@ entry:
ret void
}
-define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
; GCN-NOHSA-SI-LABEL: constant_load_v4i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -286,6 +295,9 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v4i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -341,7 +353,7 @@ entry:
ret void
}
-define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
; GCN-NOHSA-SI-LABEL: constant_load_v8i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -360,6 +372,9 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v8i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -420,7 +435,7 @@ entry:
ret void
}
-define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
; GCN-NOHSA-SI-LABEL: constant_load_v16i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
@@ -445,6 +460,9 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
; GCN-HSA-LABEL: constant_load_v16i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GCN-HSA-NEXT: s_add_u32 s10, s8, 16
@@ -584,6 +602,9 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-HSA-LABEL: constant_load_v16i16_align2:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
@@ -837,6 +858,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_zextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -912,6 +936,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_sextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -988,6 +1015,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1063,6 +1093,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1137,6 +1170,9 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1222,6 +1258,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1288,7 +1327,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
ret void
}
-define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
; GCN-NOHSA-SI-LABEL: constant_zextload_v3i16_to_v3i32:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1311,6 +1350,9 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1385,7 +1427,7 @@ entry:
ret void
}
-define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
; GCN-NOHSA-SI-LABEL: constant_sextload_v3i16_to_v3i32:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1408,6 +1450,9 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1510,6 +1555,9 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1616,6 +1664,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1733,6 +1784,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -1891,6 +1945,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2068,6 +2125,9 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2330,6 +2390,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2637,7 +2700,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3118,7 +3184,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3686,7 +3755,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4602,7 +4674,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -5389,6 +5464,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_zextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5486,6 +5564,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_sextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5584,6 +5665,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5676,6 +5760,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5773,12 +5860,15 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s0, s2, 16
; GCN-HSA-NEXT: s_and_b32 s1, s2, 0xffff
@@ -5883,6 +5973,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -5986,10 +6079,13 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16
; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16
@@ -6142,6 +6238,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6298,10 +6397,13 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16
; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16
@@ -6516,6 +6618,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6777,10 +6882,13 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16
; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16
@@ -7162,6 +7270,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -7637,7 +7748,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -8360,7 +8474,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -9079,4 +9196,4 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; ret void
; }
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index ad291e742ebef..68a6a148819e8 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -23,6 +23,9 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
; GFX7-HSA-LABEL: constant_load_i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -103,6 +106,9 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v2i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -190,6 +196,9 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v3i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -284,6 +293,9 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v4i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -383,6 +395,9 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v8i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16
@@ -517,6 +532,9 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v9i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s12, s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -678,6 +696,9 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-LABEL: constant_load_v10i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -847,6 +868,9 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-LABEL: constant_load_v11i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -1023,6 +1047,9 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-LABEL: constant_load_v12i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -1202,7 +1229,10 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
;
; GFX7-HSA-LABEL: constant_load_v16i32:
; GFX7-HSA: ; %bb.0: ; %entry
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_add_u32 s18, s16, 48
@@ -1389,6 +1419,9 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX7-HSA-LABEL: constant_zextload_i32_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1473,6 +1506,9 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX7-HSA-LABEL: constant_sextload_i32_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1563,6 +1599,9 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1647,6 +1686,9 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1739,12 +1781,15 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
@@ -1837,6 +1882,9 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1949,13 +1997,16 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
@@ -2082,6 +2133,9 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2244,8 +2298,10 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48
@@ -2253,6 +2309,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11
@@ -2452,6 +2509,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2748,7 +2808,10 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
;
; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3196,7 +3259,10 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
;
; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3628,7 +3694,10 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
;
; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4479,8 +4548,10 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xf0
; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
@@ -4509,6 +4580,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x90
; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31
@@ -5097,7 +5169,10 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
;
; GFX7-HSA-LABEL: constant_load_v32i32:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -5402,4 +5477,4 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
ret void
}
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index fa1b216e0c311..2219ceea7ec9b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
; GFX7-LABEL: constant_load_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
@@ -95,6 +98,9 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-LABEL: constant_load_v2i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-NEXT: v_mov_b32_e32 v4, s0
@@ -179,6 +185,9 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-LABEL: constant_load_v3i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
@@ -294,6 +303,9 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-LABEL: constant_load_v4i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-NEXT: s_add_u32 s10, s8, 16
@@ -421,7 +433,10 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX7-LABEL: constant_load_v8i64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-NEXT: s_add_u32 s18, s16, 48
@@ -638,7 +653,10 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
;
; GFX7-LABEL: constant_load_v16i64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -927,4 +945,4 @@ entry:
ret void
}
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index d9d76ac16d608..4031be65fab61 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -27,6 +27,9 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
; GFX7-HSA-LABEL: constant_load_i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -112,6 +115,9 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v2i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -195,6 +201,9 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v3i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -305,6 +314,9 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v4i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -374,6 +386,9 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v8i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -448,6 +463,9 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v16i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -529,6 +547,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_zextload_i8_to_i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -604,6 +625,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_sextload_i8_to_i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -680,6 +704,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -755,6 +782,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -834,6 +864,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -933,6 +966,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1030,6 +1066,9 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1131,6 +1170,9 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1232,6 +1274,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1336,6 +1381,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1453,6 +1501,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -1612,6 +1663,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -1794,6 +1848,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2060,6 +2117,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2374,6 +2434,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2856,6 +2919,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3437,7 +3503,10 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
;
; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4353,7 +4422,10 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
;
; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -5161,6 +5233,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_zextload_i8_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5243,6 +5318,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_sextload_i8_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5328,6 +5406,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5408,6 +5489,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5496,6 +5580,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5603,6 +5690,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5716,10 +5806,13 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_bfe_u32 s4, s2, 0x80008
; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 24
@@ -5854,6 +5947,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6013,10 +6109,13 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 24
; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24
@@ -6235,6 +6334,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6504,10 +6606,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s8, s5, 24
; GFX7-HSA-NEXT: s_lshr_b32 s9, s4, 24
@@ -6898,6 +7003,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -7387,10 +7495,13 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s16, s8, 24
; GFX7-HSA-NEXT: s_lshr_b32 s17, s9, 24
@@ -8128,6 +8239,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -8898,6 +9012,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_zextload_i8_to_i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -8982,6 +9099,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_sextload_i8_to_i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9068,6 +9188,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9152,6 +9275,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9241,6 +9367,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9340,6 +9469,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9452,6 +9584,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -9560,6 +9695,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -9683,6 +9821,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -9832,6 +9973,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -10014,6 +10158,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -10261,6 +10408,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -10574,6 +10724,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -11018,6 +11171,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -11445,4 +11601,4 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; ret void
; }
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index ec2e6359ad46b..9054e509cde8e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -7,7 +7,7 @@
; FIXME: r600 is broken because the bigger testcases spill and it's not implemented
-define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-NOHSA-SI-LABEL: global_load_i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -28,6 +28,9 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(
; GCN-HSA-LABEL: global_load_i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -112,7 +115,7 @@ entry:
ret void
}
-define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-NOHSA-SI-LABEL: global_load_v2i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -133,6 +136,9 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v2i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -197,7 +203,7 @@ entry:
ret void
}
-define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-NOHSA-SI-LABEL: global_load_v3i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -219,6 +225,9 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v3i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -318,7 +327,7 @@ entry:
ret void
}
-define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-NOHSA-SI-LABEL: global_load_v4i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -339,6 +348,9 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v4i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -403,7 +415,7 @@ entry:
ret void
}
-define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-NOHSA-SI-LABEL: global_load_v8i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -424,6 +436,9 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v8i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -488,7 +503,7 @@ entry:
ret void
}
-define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-NOHSA-SI-LABEL: global_load_v16i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -512,6 +527,9 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
; GCN-HSA-LABEL: global_load_v16i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
@@ -662,6 +680,9 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
; GCN-HSA-LABEL: global_load_v16i16_align2:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -811,6 +832,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_zextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -896,6 +920,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_sextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -984,6 +1011,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1069,6 +1099,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1159,6 +1192,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1258,6 +1294,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1334,7 +1373,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
ret void
}
-define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-NOHSA-SI-LABEL: global_zextload_v3i16_to_v3i32:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1359,6 +1398,9 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1444,7 +1486,7 @@ entry:
ret void
}
-define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-NOHSA-SI-LABEL: global_sextload_v3i16_to_v3i32:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1469,6 +1511,9 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1586,6 +1631,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1701,6 +1749,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1823,6 +1874,9 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1972,6 +2026,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2136,6 +2193,9 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -2372,6 +2432,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2643,6 +2706,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -3054,6 +3120,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -3573,6 +3642,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -4377,6 +4449,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5142,6 +5217,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_zextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5239,6 +5317,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_sextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5334,6 +5415,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5426,6 +5510,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5524,6 +5611,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5633,6 +5723,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5751,6 +5844,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5896,6 +5992,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6056,10 +6155,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6074,8 +6173,11 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v14, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v16, v4
@@ -6275,6 +6377,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6525,10 +6630,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6545,7 +6650,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v18, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v20, v8
@@ -6905,6 +7013,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -7376,6 +7487,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -8078,6 +8192,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -8722,5 +8839,4 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; ret void
; }
-attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-no-flat-scratch-init" }
-attributes #1 = { "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 5652ab3ed4e70..e8c862a3cb93c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -27,6 +27,9 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(
; GCNX3-HSA-LABEL: global_load_i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -106,6 +109,9 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v2i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -186,6 +192,9 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v3i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -270,6 +279,9 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v4i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -352,6 +364,9 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v8i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
@@ -458,6 +473,9 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v9i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -589,6 +607,9 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v10i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -719,6 +740,9 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v11i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -854,6 +878,9 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v12i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -987,6 +1014,9 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v16i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -1134,6 +1164,9 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr
; GCNX3-HSA-LABEL: global_zextload_i32_to_i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1217,6 +1250,9 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr
; GCNX3-HSA-LABEL: global_sextload_i32_to_i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1301,6 +1337,9 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1384,6 +1423,9 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1471,6 +1513,9 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1569,6 +1614,9 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1674,8 +1722,10 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1683,6 +1733,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2
@@ -1800,6 +1851,9 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1941,8 +1995,10 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1957,6 +2013,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9
; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
@@ -2134,6 +2191,9 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2370,6 +2430,9 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2731,8 +2794,10 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -2766,6 +2831,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17
; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
@@ -3122,6 +3188,9 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -3589,12 +3658,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
;
; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64:
; GCN-GFX900-HSA: ; %bb.0:
-; GCN-GFX900-HSA-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-GFX900-HSA-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-GFX900-HSA-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0
-; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s15
-; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0
+; GCN-GFX900-HSA-NEXT: s_add_u32 s20, s20, s17
+; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0
; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112
@@ -3620,11 +3689,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[16:19], 0 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[20:23], 0 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11
@@ -3667,11 +3736,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[16:19], 0 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[20:23], 0 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51
@@ -3913,6 +3982,9 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -4437,6 +4509,9 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v32i32:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -4649,4 +4724,4 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
ret void
}
-attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index d74064a6da9c5..8d020b9e1a603 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -9,6 +9,8 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
; CHECK-LABEL: memcpy_p0_p0_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v12, s3
; CHECK-NEXT: v_mov_b32_e32 v11, s2
@@ -94,12 +96,12 @@ entry:
define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 {
; CHECK-LABEL: memcpy_p5_p4_minsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
-; CHECK-NEXT: s_add_u32 s16, s16, s15
+; CHECK-NEXT: s_add_u32 s20, s20, s17
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
@@ -107,50 +109,50 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
-; CHECK-NEXT: s_addc_u32 s17, s17, 0
+; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: v_mov_b32_e32 v25, s2
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
-; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
-; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
-; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
+; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(9)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
-; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
-; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
-; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
+; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(13)
-; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
-; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
-; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
-; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
+; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80
; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
-; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
-; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
-; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
+; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
-; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
-; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
-; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
+; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen offset:60
+; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
-; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
-; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
-; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
+; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
-; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -160,55 +162,57 @@ entry:
define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 {
; CHECK-LABEL: memcpy_p0_p5_minsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
-; CHECK-NEXT: s_add_u32 s16, s16, s15
-; CHECK-NEXT: s_addc_u32 s17, s17, 0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; CHECK-NEXT: s_add_u32 s20, s20, s17
+; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v26, s0
-; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
-; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48
+; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
-; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
-; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64
@@ -268,6 +272,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v21, s1
; CHECK-NEXT: v_mov_b32_e32 v20, s0
@@ -294,6 +300,8 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-LABEL: memcpy_p0_p0_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v12, s3
; CHECK-NEXT: v_mov_b32_e32 v11, s2
@@ -379,12 +387,12 @@ entry:
define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 {
; CHECK-LABEL: memcpy_p5_p4_optsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
-; CHECK-NEXT: s_add_u32 s16, s16, s15
+; CHECK-NEXT: s_add_u32 s20, s20, s17
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
@@ -392,50 +400,50 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
-; CHECK-NEXT: s_addc_u32 s17, s17, 0
+; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: v_mov_b32_e32 v25, s2
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
-; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
-; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
-; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
+; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(9)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
-; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
-; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
-; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
+; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(13)
-; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
-; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
-; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
-; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
+; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80
; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
-; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
-; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
-; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
+; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
-; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
-; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
-; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
+; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen offset:60
+; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
-; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
-; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
-; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
+; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
-; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -445,55 +453,57 @@ entry:
define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 {
; CHECK-LABEL: memcpy_p0_p5_optsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
-; CHECK-NEXT: s_add_u32 s16, s16, s15
-; CHECK-NEXT: s_addc_u32 s17, s17, 0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; CHECK-NEXT: s_add_u32 s20, s20, s17
+; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v26, s0
-; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
-; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48
+; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
-; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
-; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64
@@ -553,6 +563,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v21, s1
; CHECK-NEXT: v_mov_b32_e32 v20, s0
@@ -589,6 +601,6 @@ declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly
declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
-attributes #0 = { minsize "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-no-flat-scratch-init" }
-attributes #1 = { optsize "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-no-flat-scratch-init" }
+attributes #0 = { minsize "amdgpu-flat-work-group-size"="1024,1024" }
+attributes #1 = { optsize "amdgpu-flat-work-group-size"="1024,1024" }
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index b9916080dffcb..07ad8cb0c4a3d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_agent_unordered_load(
; GFX7-LABEL: flat_agent_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX10-WGP-LABEL: flat_agent_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX10-CU-LABEL: flat_agent_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -172,7 +187,7 @@ define amdgpu_kernel void @flat_agent_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent") unordered, align 4
store i32 %val, ptr %out
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX7-LABEL: flat_agent_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX10-CU-LABEL: flat_agent_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -339,7 +369,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent") monotonic, align 4
store i32 %val, ptr %out
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_load(
; GFX7-LABEL: flat_agent_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX10-WGP-LABEL: flat_agent_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -380,6 +417,10 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX10-CU-LABEL: flat_agent_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -410,6 +451,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -423,6 +466,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -521,7 +566,7 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent") acquire, align 4
store i32 %val, ptr %out
@@ -531,6 +576,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX7-LABEL: flat_agent_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -547,6 +595,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -565,6 +617,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -598,6 +654,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -612,6 +670,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -729,7 +789,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4
store i32 %val, ptr %out
@@ -739,6 +799,9 @@ entry:
define amdgpu_kernel void @flat_agent_unordered_store(
; GFX7-LABEL: flat_agent_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -750,6 +813,10 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX10-WGP-LABEL: flat_agent_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -761,6 +828,10 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX10-CU-LABEL: flat_agent_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -783,6 +854,8 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -793,6 +866,8 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -864,7 +939,7 @@ define amdgpu_kernel void @flat_agent_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4
ret void
@@ -873,6 +948,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX7-LABEL: flat_agent_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -884,6 +962,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -895,6 +977,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX10-CU-LABEL: flat_agent_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -917,6 +1003,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -927,6 +1015,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -998,7 +1088,7 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4
ret void
@@ -1007,6 +1097,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_store(
; GFX7-LABEL: flat_agent_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1019,6 +1112,10 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX10-WGP-LABEL: flat_agent_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1032,6 +1129,10 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX10-CU-LABEL: flat_agent_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1057,6 +1158,8 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1068,6 +1171,8 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1156,7 +1261,7 @@ define amdgpu_kernel void @flat_agent_release_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") release, align 4
ret void
@@ -1165,6 +1270,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX7-LABEL: flat_agent_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1177,6 +1285,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1190,6 +1302,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1215,6 +1331,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1226,6 +1344,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1314,7 +1434,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4
ret void
@@ -1323,6 +1443,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX7-LABEL: flat_agent_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1334,6 +1457,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1345,6 +1472,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1367,6 +1498,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1377,6 +1510,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1448,7 +1583,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") monotonic
ret void
@@ -1457,6 +1592,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX7-LABEL: flat_agent_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1470,6 +1608,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1485,6 +1627,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1512,6 +1658,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1524,6 +1672,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1613,7 +1763,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
ret void
@@ -1622,6 +1772,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX7-LABEL: flat_agent_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1634,6 +1787,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1647,6 +1804,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1672,6 +1833,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1683,6 +1846,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1771,7 +1936,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") release
ret void
@@ -1780,6 +1945,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX7-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1794,6 +1962,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1811,6 +1983,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1841,6 +2017,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1854,6 +2032,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1960,7 +2140,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
ret void
@@ -1969,6 +2149,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX7-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1983,6 +2166,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2000,6 +2187,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2030,6 +2221,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2043,6 +2236,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2149,7 +2344,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
ret void
@@ -2158,6 +2353,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2174,6 +2372,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2191,6 +2393,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2223,6 +2429,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2237,6 +2445,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2342,7 +2552,7 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
store i32 %val, ptr %out, align 4
@@ -2352,6 +2562,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2369,6 +2582,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2388,6 +2605,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2423,6 +2644,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2438,6 +2661,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2564,7 +2789,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
store i32 %val, ptr %out, align 4
@@ -2574,6 +2799,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2591,6 +2819,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2610,6 +2842,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2645,6 +2881,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2660,6 +2898,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2786,7 +3026,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr %out, align 4
@@ -2796,6 +3036,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2821,6 +3064,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2846,6 +3093,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2896,6 +3147,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2910,6 +3163,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3009,7 +3264,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic
@@ -3019,6 +3274,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3046,6 +3304,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3075,6 +3337,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3130,6 +3396,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3146,6 +3414,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3263,7 +3533,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
@@ -3273,6 +3543,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3299,6 +3572,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3326,6 +3603,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3379,6 +3660,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3394,6 +3677,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3510,7 +3795,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release monotonic
@@ -3520,6 +3805,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3548,6 +3836,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3579,6 +3871,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3637,6 +3933,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3654,6 +3952,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3788,7 +4088,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
@@ -3798,6 +4098,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3826,6 +4129,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3857,6 +4164,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3915,6 +4226,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3932,6 +4245,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4066,7 +4381,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
@@ -4076,6 +4391,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4103,6 +4421,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4132,6 +4454,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4187,6 +4513,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4203,6 +4531,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4320,7 +4650,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire
@@ -4330,6 +4660,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4357,6 +4690,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4386,6 +4723,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4441,6 +4782,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4457,6 +4800,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4574,7 +4919,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
@@ -4584,6 +4929,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4612,6 +4960,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4643,6 +4995,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4701,6 +5057,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4718,6 +5076,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4852,7 +5212,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release acquire
@@ -4862,6 +5222,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4890,6 +5253,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4921,6 +5288,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4979,6 +5350,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4996,6 +5369,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5130,7 +5505,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
@@ -5140,6 +5515,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5168,6 +5546,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5199,6 +5581,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5257,6 +5643,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5274,6 +5662,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5408,7 +5798,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
@@ -5418,6 +5808,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5446,6 +5839,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5477,6 +5874,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5535,6 +5936,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5552,6 +5955,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5686,7 +6091,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst
@@ -5696,6 +6101,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5724,6 +6132,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5755,6 +6167,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5813,6 +6229,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5830,6 +6248,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5964,7 +6384,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst
@@ -5974,6 +6394,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6002,6 +6425,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6033,6 +6460,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6091,6 +6522,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6108,6 +6541,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6242,7 +6677,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release seq_cst
@@ -6252,6 +6687,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6280,6 +6718,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6311,6 +6753,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6369,6 +6815,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6386,6 +6834,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6520,7 +6970,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst
@@ -6530,6 +6980,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6558,6 +7011,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6589,6 +7046,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6647,6 +7108,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6664,6 +7127,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6798,7 +7263,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -6808,6 +7273,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6837,6 +7305,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6866,6 +7338,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6924,6 +7400,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6941,6 +7419,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7065,7 +7545,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic
@@ -7077,6 +7557,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7107,6 +7590,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7138,6 +7625,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7198,6 +7689,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7216,6 +7709,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7349,7 +7844,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
@@ -7361,6 +7856,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7391,6 +7889,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7422,6 +7924,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7483,6 +7989,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7501,6 +8009,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7642,7 +8152,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release monotonic
@@ -7654,6 +8164,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7685,6 +8198,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7718,6 +8235,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7781,6 +8302,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7800,6 +8323,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7954,7 +8479,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
@@ -7966,6 +8491,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7997,6 +8525,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8030,6 +8562,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8093,6 +8629,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8112,6 +8650,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8266,7 +8806,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
@@ -8278,6 +8818,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8308,6 +8851,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8339,6 +8886,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8399,6 +8950,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8417,6 +8970,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8554,7 +9109,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire
@@ -8566,6 +9121,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8596,6 +9154,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8627,6 +9189,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8687,6 +9253,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8705,6 +9273,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8838,7 +9408,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
@@ -8850,6 +9420,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8881,6 +9454,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8914,6 +9491,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8977,6 +9558,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8996,6 +9579,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9150,7 +9735,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release acquire
@@ -9162,6 +9747,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9193,6 +9781,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9226,6 +9818,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9289,6 +9885,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9308,6 +9906,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9462,7 +10062,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
@@ -9474,6 +10074,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9505,6 +10108,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9538,6 +10145,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9601,6 +10212,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9620,6 +10233,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9774,7 +10389,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
@@ -9786,6 +10401,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9817,6 +10435,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9850,6 +10472,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9913,6 +10539,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9932,6 +10560,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10086,7 +10716,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst
@@ -10098,6 +10728,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10129,6 +10762,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10162,6 +10799,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10225,6 +10866,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10244,6 +10887,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10394,7 +11039,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst
@@ -10406,6 +11051,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10437,6 +11085,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10470,6 +11122,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10533,6 +11189,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10552,6 +11210,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10706,7 +11366,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release seq_cst
@@ -10718,6 +11378,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10749,6 +11412,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10782,6 +11449,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10845,6 +11516,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10864,6 +11537,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11018,7 +11693,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst
@@ -11030,6 +11705,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11061,6 +11739,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11094,6 +11776,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11157,6 +11843,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11176,6 +11864,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11330,7 +12020,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -11342,6 +12032,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX7-LABEL: flat_agent_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11356,6 +12049,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11370,6 +12067,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11398,6 +12099,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11410,6 +12113,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11499,7 +12204,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") unordered, align 4
store i32 %val, ptr %out
@@ -11509,6 +12214,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX7-LABEL: flat_agent_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11523,6 +12231,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11537,6 +12249,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11565,6 +12281,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11577,6 +12295,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11666,7 +12386,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") monotonic, align 4
store i32 %val, ptr %out
@@ -11676,6 +12396,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX7-LABEL: flat_agent_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11692,6 +12415,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11709,6 +12436,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11741,6 +12472,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11755,6 +12488,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11858,7 +12593,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") acquire, align 4
store i32 %val, ptr %out
@@ -11868,6 +12603,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX7-LABEL: flat_agent_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11885,6 +12623,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11904,6 +12646,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11939,6 +12685,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11954,6 +12702,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12076,7 +12826,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") seq_cst, align 4
store i32 %val, ptr %out
@@ -12086,6 +12836,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX7-LABEL: flat_agent_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12097,6 +12850,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12108,6 +12865,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12130,6 +12891,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12140,6 +12903,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12211,7 +12976,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") unordered, align 4
ret void
@@ -12220,6 +12985,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX7-LABEL: flat_agent_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12231,6 +12999,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12242,6 +13014,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12264,6 +13040,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12274,6 +13052,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12345,7 +13125,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") monotonic, align 4
ret void
@@ -12354,6 +13134,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX7-LABEL: flat_agent_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12366,6 +13149,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12379,6 +13166,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12404,6 +13195,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12415,6 +13208,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12503,7 +13298,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") release, align 4
ret void
@@ -12512,6 +13307,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX7-LABEL: flat_agent_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12524,6 +13322,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12537,6 +13339,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12562,6 +13368,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12573,6 +13381,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12661,7 +13471,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") seq_cst, align 4
ret void
@@ -12670,6 +13480,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12681,6 +13494,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12692,6 +13509,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12714,6 +13535,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12724,6 +13547,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12795,7 +13620,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") monotonic
ret void
@@ -12804,6 +13629,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12817,6 +13645,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12831,6 +13663,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12857,6 +13693,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12869,6 +13707,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12956,7 +13796,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
ret void
@@ -12965,6 +13805,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12977,6 +13820,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12990,6 +13837,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13015,6 +13866,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13026,6 +13879,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13114,7 +13969,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") release
ret void
@@ -13123,6 +13978,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13137,6 +13995,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13153,6 +14015,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13182,6 +14048,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13195,6 +14063,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13299,7 +14169,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
ret void
@@ -13308,6 +14178,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13322,6 +14195,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13338,6 +14215,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13367,6 +14248,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13380,6 +14263,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13484,7 +14369,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
ret void
@@ -13493,6 +14378,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13510,6 +14398,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13528,6 +14420,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13562,6 +14458,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13577,6 +14475,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13687,7 +14587,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
store i32 %val, ptr %out, align 4
@@ -13697,6 +14597,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13715,6 +14618,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13735,6 +14642,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13772,6 +14683,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13788,6 +14701,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13919,7 +14834,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
store i32 %val, ptr %out, align 4
@@ -13929,6 +14844,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13947,6 +14865,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13967,6 +14889,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14004,6 +14930,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14020,6 +14948,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14151,7 +15081,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
store i32 %val, ptr %out, align 4
@@ -14161,6 +15091,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14186,6 +15119,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14211,6 +15148,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14261,6 +15202,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14275,6 +15218,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14374,7 +15319,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
@@ -14384,6 +15329,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14411,6 +15359,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14439,6 +15391,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14493,6 +15449,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14509,6 +15467,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14624,7 +15584,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
@@ -14634,6 +15594,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14660,6 +15623,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14687,6 +15654,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14740,6 +15711,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14755,6 +15728,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14871,7 +15846,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic
@@ -14881,6 +15856,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14909,6 +15887,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14939,6 +15921,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14996,6 +15982,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15013,6 +16001,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15145,7 +16135,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
@@ -15155,6 +16145,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15183,6 +16176,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15213,6 +16210,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15270,6 +16271,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15287,6 +16290,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15419,7 +16424,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
@@ -15429,6 +16434,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15456,6 +16464,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15484,6 +16496,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15538,6 +16554,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15554,6 +16572,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15669,7 +16689,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire
@@ -15679,6 +16699,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15706,6 +16729,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15734,6 +16761,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15788,6 +16819,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15804,6 +16837,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15919,7 +16954,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
@@ -15929,6 +16964,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15957,6 +16995,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15987,6 +17029,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16044,6 +17090,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16061,6 +17109,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16193,7 +17243,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
@@ -16203,6 +17253,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16231,6 +17284,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16261,6 +17318,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16318,6 +17379,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16335,6 +17398,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16467,7 +17532,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
@@ -16477,6 +17542,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16505,6 +17573,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16535,6 +17607,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16592,6 +17668,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16609,6 +17687,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16741,7 +17821,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
@@ -16751,6 +17831,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16779,6 +17862,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16809,6 +17896,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16866,6 +17957,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16883,6 +17976,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17015,7 +18110,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst
@@ -17025,6 +18120,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17053,6 +18151,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17083,6 +18185,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17140,6 +18246,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17157,6 +18265,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17289,7 +18399,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst
@@ -17299,6 +18409,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17327,6 +18440,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17357,6 +18474,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17414,6 +18535,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17431,6 +18554,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17563,7 +18688,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst
@@ -17573,6 +18698,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17601,6 +18729,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17631,6 +18763,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17688,6 +18824,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17705,6 +18843,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17837,7 +18977,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst
@@ -17847,6 +18987,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17875,6 +19018,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17905,6 +19052,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17962,6 +19113,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17979,6 +19132,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18111,7 +19266,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
@@ -18121,6 +19276,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18150,6 +19308,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18179,6 +19341,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18237,6 +19403,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18254,6 +19422,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18378,7 +19548,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
@@ -18390,6 +19560,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18421,6 +19594,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18453,6 +19630,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18515,6 +19696,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18534,6 +19717,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18672,7 +19857,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
@@ -18684,6 +19869,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18714,6 +19902,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18745,6 +19937,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18806,6 +20002,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18824,6 +20022,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18965,7 +20165,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic
@@ -18977,6 +20177,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19009,6 +20212,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19043,6 +20250,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19108,6 +20319,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19128,6 +20341,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19287,7 +20502,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
@@ -19299,6 +20514,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19331,6 +20549,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19365,6 +20587,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19430,6 +20656,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19450,6 +20678,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19609,7 +20839,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
@@ -19621,6 +20851,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19652,6 +20885,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19684,6 +20921,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19746,6 +20987,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19765,6 +21008,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19907,7 +21152,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire
@@ -19919,6 +21164,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19950,6 +21198,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19982,6 +21234,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20044,6 +21300,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20063,6 +21321,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20201,7 +21461,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
@@ -20213,6 +21473,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20245,6 +21508,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20279,6 +21546,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20344,6 +21615,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20364,6 +21637,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20523,7 +21798,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
@@ -20535,6 +21810,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20567,6 +21845,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20601,6 +21883,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20666,6 +21952,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20686,6 +21974,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20845,7 +22135,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
@@ -20857,6 +22147,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20889,6 +22182,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20923,6 +22220,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20988,6 +22289,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21008,6 +22311,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21167,7 +22472,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
@@ -21179,6 +22484,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21211,6 +22519,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21245,6 +22557,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21310,6 +22626,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21330,6 +22648,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21489,7 +22809,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst
@@ -21501,6 +22821,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21533,6 +22856,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21567,6 +22894,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21632,6 +22963,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21652,6 +22985,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21807,7 +23142,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst
@@ -21819,6 +23154,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21851,6 +23189,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21885,6 +23227,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21950,6 +23296,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21970,6 +23318,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22129,7 +23479,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst
@@ -22141,6 +23491,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22173,6 +23526,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22207,6 +23564,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22272,6 +23633,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22292,6 +23655,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22451,7 +23816,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst
@@ -22463,6 +23828,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22495,6 +23863,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22529,6 +23901,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22594,6 +23970,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22614,6 +23992,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22773,7 +24153,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
@@ -22781,5 +24161,3 @@ entry:
store i32 %val0, ptr %out, align 4
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 12ed89a163a6b..3c24c36ec547d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-CU-LABEL: flat_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -172,7 +187,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load i32, ptr %in, align 4, !nontemporal !0
store i32 %val, ptr %out
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX7-LABEL: flat_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -211,6 +229,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
@@ -240,6 +262,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-CU-LABEL: flat_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
@@ -298,6 +324,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
@@ -329,6 +357,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
@@ -525,7 +555,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr %in, i32 %tid
@@ -537,6 +567,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX7-LABEL: flat_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -551,6 +584,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -565,6 +602,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-CU-LABEL: flat_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -593,6 +634,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -605,6 +648,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -694,7 +739,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load i32, ptr %in, align 4
store i32 %val, ptr %out, !nontemporal !0
@@ -704,6 +749,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX7-LABEL: flat_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -732,6 +780,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -759,6 +811,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-CU-LABEL: flat_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -814,6 +870,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -843,6 +901,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1035,7 +1095,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr %in, align 4
@@ -1047,6 +1107,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX7-LABEL: flat_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1062,6 +1125,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1077,6 +1144,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1107,6 +1178,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1120,6 +1193,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1218,13 +1293,12 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load volatile i32, ptr %in, align 4, !nontemporal !0
store i32 %val, ptr %out
ret void
}
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 3dd82b74a2b5d..b88a10ab24a98 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX7-LABEL: flat_singlethread_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX10-CU-LABEL: flat_singlethread_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -172,7 +187,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread") unordered, align 4
store i32 %val, ptr %out
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX7-LABEL: flat_singlethread_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -339,7 +369,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread") monotonic, align 4
store i32 %val, ptr %out
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX7-LABEL: flat_singlethread_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -363,6 +396,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -377,6 +414,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -405,6 +446,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -417,6 +460,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -506,7 +551,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread") acquire, align 4
store i32 %val, ptr %out
@@ -516,6 +561,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX7-LABEL: flat_singlethread_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -530,6 +578,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -544,6 +596,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -572,6 +628,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -584,6 +642,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -673,7 +733,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread") seq_cst, align 4
store i32 %val, ptr %out
@@ -683,6 +743,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX7-LABEL: flat_singlethread_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -694,6 +757,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -705,6 +772,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX10-CU-LABEL: flat_singlethread_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -727,6 +798,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -737,6 +810,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -808,7 +883,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4
ret void
@@ -817,6 +892,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX7-LABEL: flat_singlethread_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -828,6 +906,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -839,6 +921,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -861,6 +947,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -871,6 +959,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -942,7 +1032,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4
ret void
@@ -951,6 +1041,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_store(
; GFX7-LABEL: flat_singlethread_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -962,6 +1055,10 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX10-WGP-LABEL: flat_singlethread_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -973,6 +1070,10 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX10-CU-LABEL: flat_singlethread_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -995,6 +1096,8 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1005,6 +1108,8 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1076,7 +1181,7 @@ define amdgpu_kernel void @flat_singlethread_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4
ret void
@@ -1085,6 +1190,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX7-LABEL: flat_singlethread_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1096,6 +1204,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1107,6 +1219,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1129,6 +1245,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1139,6 +1257,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1210,7 +1330,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4
ret void
@@ -1219,6 +1339,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1230,6 +1353,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1241,6 +1368,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1263,6 +1394,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1273,6 +1406,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1344,7 +1479,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic
ret void
@@ -1353,6 +1488,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1364,6 +1502,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1375,6 +1517,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1397,6 +1543,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1407,6 +1555,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1478,7 +1628,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
ret void
@@ -1487,6 +1637,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX7-LABEL: flat_singlethread_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1498,6 +1651,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1509,6 +1666,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1531,6 +1692,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1541,6 +1704,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1612,7 +1777,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release
ret void
@@ -1621,6 +1786,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1632,6 +1800,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1643,6 +1815,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1665,6 +1841,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1675,6 +1853,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1746,7 +1926,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
ret void
@@ -1755,6 +1935,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1766,6 +1949,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1777,6 +1964,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1799,6 +1990,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1809,6 +2002,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1880,7 +2075,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
ret void
@@ -1889,6 +2084,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1904,6 +2102,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1919,6 +2121,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1949,6 +2155,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1962,6 +2170,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2058,7 +2268,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
store i32 %val, ptr %out, align 4
@@ -2068,6 +2278,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2083,6 +2296,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2098,6 +2315,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2128,6 +2349,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2141,6 +2364,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2237,7 +2462,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
store i32 %val, ptr %out, align 4
@@ -2247,6 +2472,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2262,6 +2490,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2277,6 +2509,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2307,6 +2543,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2320,6 +2558,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2416,7 +2656,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
store i32 %val, ptr %out, align 4
@@ -2426,6 +2666,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2451,6 +2694,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2476,6 +2723,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2526,6 +2777,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2540,6 +2793,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2639,7 +2894,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
@@ -2649,6 +2904,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2674,6 +2932,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2699,6 +2961,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2749,6 +3015,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2763,6 +3031,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2862,7 +3132,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
@@ -2872,6 +3142,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2897,6 +3170,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2922,6 +3199,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2972,6 +3253,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2986,6 +3269,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3085,7 +3370,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
@@ -3095,6 +3380,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3120,6 +3408,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3145,6 +3437,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3195,6 +3491,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3209,6 +3507,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3308,7 +3608,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
@@ -3318,6 +3618,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3343,6 +3646,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3368,6 +3675,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3418,6 +3729,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3432,6 +3745,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3531,7 +3846,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
@@ -3541,6 +3856,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3566,6 +3884,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3591,6 +3913,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3641,6 +3967,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3655,6 +3983,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3754,7 +4084,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
@@ -3764,6 +4094,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3789,6 +4122,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3814,6 +4151,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3864,6 +4205,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3878,6 +4221,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3977,7 +4322,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
@@ -3987,6 +4332,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4012,6 +4360,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4037,6 +4389,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4087,6 +4443,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4101,6 +4459,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4200,7 +4560,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
@@ -4210,6 +4570,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4235,6 +4598,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4260,6 +4627,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4310,6 +4681,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4324,6 +4697,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4423,7 +4798,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
@@ -4433,6 +4808,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4458,6 +4836,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4483,6 +4865,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4533,6 +4919,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4547,6 +4935,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4646,7 +5036,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
@@ -4656,6 +5046,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4681,6 +5074,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4706,6 +5103,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4756,6 +5157,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4770,6 +5173,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4869,7 +5274,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
@@ -4879,6 +5284,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4904,6 +5312,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4929,6 +5341,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4979,6 +5395,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4993,6 +5411,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5092,7 +5512,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
@@ -5102,6 +5522,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5127,6 +5550,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5152,6 +5579,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5202,6 +5633,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5216,6 +5649,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5315,7 +5750,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
@@ -5325,6 +5760,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5350,6 +5788,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5375,6 +5817,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5425,6 +5871,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5439,6 +5887,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5538,7 +5988,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
@@ -5548,6 +5998,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5573,6 +6026,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5598,6 +6055,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5648,6 +6109,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5662,6 +6125,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5761,7 +6226,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
@@ -5771,6 +6236,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -5800,6 +6268,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5829,6 +6301,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5887,6 +6363,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5904,6 +6382,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6028,7 +6508,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
@@ -6040,6 +6520,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6069,6 +6552,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6098,6 +6585,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6156,6 +6647,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6173,6 +6666,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6297,7 +6792,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
@@ -6309,6 +6804,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6338,6 +6836,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6367,6 +6869,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6425,6 +6931,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6442,6 +6950,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6566,7 +7076,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
@@ -6578,6 +7088,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6607,6 +7120,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6636,6 +7153,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6694,6 +7215,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6711,6 +7234,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6835,7 +7360,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
@@ -6847,6 +7372,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6876,6 +7404,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6905,6 +7437,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6963,6 +7499,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6980,6 +7518,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7104,7 +7644,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
@@ -7116,6 +7656,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7145,6 +7688,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7174,6 +7721,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7232,6 +7783,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7249,6 +7802,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7373,7 +7928,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
@@ -7385,6 +7940,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7414,6 +7972,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7443,6 +8005,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7501,6 +8067,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7518,6 +8086,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7642,7 +8212,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
@@ -7654,6 +8224,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7683,6 +8256,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7712,6 +8289,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7770,6 +8351,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7787,6 +8370,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7911,7 +8496,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
@@ -7923,6 +8508,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7952,6 +8540,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7981,6 +8573,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8039,6 +8635,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8056,6 +8654,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8180,7 +8780,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
@@ -8192,6 +8792,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8221,6 +8824,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8250,6 +8857,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8308,6 +8919,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8325,6 +8938,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8449,7 +9064,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
@@ -8461,6 +9076,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8490,6 +9108,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8519,6 +9141,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8577,6 +9203,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8594,6 +9222,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8718,7 +9348,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
@@ -8730,6 +9360,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8759,6 +9392,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8788,6 +9425,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8846,6 +9487,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8863,6 +9506,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8987,7 +9632,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
@@ -8999,6 +9644,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9028,6 +9676,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9057,6 +9709,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9115,6 +9771,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9132,6 +9790,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9256,7 +9916,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
@@ -9268,6 +9928,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9297,6 +9960,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9326,6 +9993,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9384,6 +10055,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9401,6 +10074,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9525,7 +10200,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
@@ -9537,6 +10212,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9566,6 +10244,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9595,6 +10277,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9653,6 +10339,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9670,6 +10358,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9794,7 +10484,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
@@ -9806,6 +10496,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9820,6 +10513,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -9834,6 +10531,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -9862,6 +10563,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9874,6 +10577,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9963,7 +10668,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") unordered, align 4
store i32 %val, ptr %out
@@ -9973,6 +10678,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9987,6 +10695,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10001,6 +10713,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10029,6 +10745,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10041,6 +10759,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10130,7 +10850,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") monotonic, align 4
store i32 %val, ptr %out
@@ -10140,6 +10860,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10154,6 +10877,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10168,6 +10895,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10196,6 +10927,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10208,6 +10941,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10297,7 +11032,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") acquire, align 4
store i32 %val, ptr %out
@@ -10307,6 +11042,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10321,6 +11059,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10335,6 +11077,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10363,6 +11109,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10375,6 +11123,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10464,7 +11214,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") seq_cst, align 4
store i32 %val, ptr %out
@@ -10474,6 +11224,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10485,6 +11238,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10496,6 +11253,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10518,6 +11279,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10528,6 +11291,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10599,7 +11364,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4
ret void
@@ -10608,6 +11373,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10619,6 +11387,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10630,6 +11402,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10652,6 +11428,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10662,6 +11440,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10733,7 +11513,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4
ret void
@@ -10742,6 +11522,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX7-LABEL: flat_singlethread_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10753,6 +11536,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10764,6 +11551,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10786,6 +11577,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10796,6 +11589,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10867,7 +11662,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4
ret void
@@ -10876,6 +11671,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10887,6 +11685,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10898,6 +11700,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10920,6 +11726,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10930,6 +11738,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11001,7 +11811,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4
ret void
@@ -11010,6 +11820,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11021,6 +11834,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11032,6 +11849,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11054,6 +11875,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11064,6 +11887,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11135,7 +11960,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic
ret void
@@ -11144,6 +11969,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11155,6 +11983,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11166,6 +11998,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11188,6 +12024,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11198,6 +12036,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11269,7 +12109,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
ret void
@@ -11278,6 +12118,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11289,6 +12132,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11300,6 +12147,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11322,6 +12173,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11332,6 +12185,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11403,7 +12258,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release
ret void
@@ -11412,6 +12267,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11423,6 +12281,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11434,6 +12296,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11456,6 +12322,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11466,6 +12334,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11537,7 +12407,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
ret void
@@ -11546,6 +12416,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11557,6 +12430,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11568,6 +12445,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11590,6 +12471,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11600,6 +12483,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11671,7 +12556,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
ret void
@@ -11680,6 +12565,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11695,6 +12583,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11710,6 +12602,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11740,6 +12636,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11753,6 +12651,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11849,7 +12749,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
store i32 %val, ptr %out, align 4
@@ -11859,6 +12759,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11874,6 +12777,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11889,6 +12796,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11919,6 +12830,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11932,6 +12845,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12028,7 +12943,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
store i32 %val, ptr %out, align 4
@@ -12038,6 +12953,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12053,6 +12971,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12068,6 +12990,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12098,6 +13024,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12111,6 +13039,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12207,7 +13137,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
store i32 %val, ptr %out, align 4
@@ -12217,6 +13147,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12242,6 +13175,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12267,6 +13204,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12317,6 +13258,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12331,6 +13274,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12430,7 +13375,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
@@ -12440,6 +13385,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12465,6 +13413,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12490,6 +13442,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12540,6 +13496,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12554,6 +13512,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12653,7 +13613,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
@@ -12663,6 +13623,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12688,6 +13651,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12713,6 +13680,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12763,6 +13734,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12777,6 +13750,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12876,7 +13851,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
@@ -12886,6 +13861,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12911,6 +13889,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12936,6 +13918,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12986,6 +13972,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13000,6 +13988,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13099,7 +14089,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
@@ -13109,6 +14099,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13134,6 +14127,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13159,6 +14156,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13209,6 +14210,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13223,6 +14226,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13322,7 +14327,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
@@ -13332,6 +14337,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13357,6 +14365,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13382,6 +14394,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13432,6 +14448,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13446,6 +14464,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13545,7 +14565,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
@@ -13555,6 +14575,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13580,6 +14603,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13605,6 +14632,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13655,6 +14686,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13669,6 +14702,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13768,7 +14803,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
@@ -13778,6 +14813,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13803,6 +14841,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13828,6 +14870,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13878,6 +14924,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13892,6 +14940,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13991,7 +15041,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
@@ -14001,6 +15051,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14026,6 +15079,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14051,6 +15108,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14101,6 +15162,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14115,6 +15178,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14214,7 +15279,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
@@ -14224,6 +15289,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14249,6 +15317,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14274,6 +15346,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14324,6 +15400,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14338,6 +15416,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14437,7 +15517,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
@@ -14447,6 +15527,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14472,6 +15555,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14497,6 +15584,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14547,6 +15638,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14561,6 +15654,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14660,7 +15755,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
@@ -14670,6 +15765,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14695,6 +15793,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14720,6 +15822,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14770,6 +15876,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14784,6 +15892,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14883,7 +15993,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
@@ -14893,6 +16003,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14918,6 +16031,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14943,6 +16060,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14993,6 +16114,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15007,6 +16130,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15106,7 +16231,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
@@ -15116,6 +16241,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15141,6 +16269,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15166,6 +16298,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15216,6 +16352,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15230,6 +16368,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15329,7 +16469,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
@@ -15339,6 +16479,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15364,6 +16507,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15389,6 +16536,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15439,6 +16590,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15453,6 +16606,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15552,7 +16707,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
@@ -15562,6 +16717,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15591,6 +16749,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15620,6 +16782,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15678,6 +16844,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15695,6 +16863,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15819,7 +16989,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
@@ -15831,6 +17001,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15860,6 +17033,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15889,6 +17066,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15947,6 +17128,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15964,6 +17147,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16088,7 +17273,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
@@ -16100,6 +17285,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16129,6 +17317,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16158,6 +17350,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16216,6 +17412,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16233,6 +17431,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16357,7 +17557,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
@@ -16369,6 +17569,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16398,6 +17601,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16427,6 +17634,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16485,6 +17696,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16502,6 +17715,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16626,7 +17841,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
@@ -16638,6 +17853,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16667,6 +17885,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16696,6 +17918,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16754,6 +17980,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16771,6 +17999,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16895,7 +18125,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
@@ -16907,6 +18137,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16936,6 +18169,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16965,6 +18202,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17023,6 +18264,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17040,6 +18283,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17164,7 +18409,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
@@ -17176,6 +18421,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17205,6 +18453,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17234,6 +18486,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17292,6 +18548,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17309,6 +18567,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17433,7 +18693,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
@@ -17445,6 +18705,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17474,6 +18737,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17503,6 +18770,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17561,6 +18832,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17578,6 +18851,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17702,7 +18977,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
@@ -17714,6 +18989,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17743,6 +19021,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17772,6 +19054,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17830,6 +19116,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17847,6 +19135,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17971,7 +19261,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
@@ -17983,6 +19273,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18012,6 +19305,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18041,6 +19338,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18099,6 +19400,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18116,6 +19419,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18240,7 +19545,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
@@ -18252,6 +19557,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18281,6 +19589,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18310,6 +19622,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18368,6 +19684,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18385,6 +19703,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18509,7 +19829,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
@@ -18521,6 +19841,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18550,6 +19873,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18579,6 +19906,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18637,6 +19968,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18654,6 +19987,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18778,7 +20113,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
@@ -18790,6 +20125,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18819,6 +20157,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18848,6 +20190,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18906,6 +20252,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18923,6 +20271,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19047,7 +20397,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
@@ -19059,6 +20409,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19088,6 +20441,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19117,6 +20474,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19175,6 +20536,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19192,6 +20555,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19316,7 +20681,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
@@ -19328,6 +20693,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19357,6 +20725,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19386,6 +20758,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19444,6 +20820,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19461,6 +20839,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19585,7 +20965,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
@@ -19593,6 +20973,3 @@ entry:
store i32 %val0, ptr %out, align 4
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
-
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index dc30a1e2de77a..919fc3e8f4e4f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_system_unordered_load(
; GFX7-LABEL: flat_system_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX10-WGP-LABEL: flat_system_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX10-CU-LABEL: flat_system_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -172,7 +187,7 @@ define amdgpu_kernel void @flat_system_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in unordered, align 4
store i32 %val, ptr %out
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_load(
; GFX7-LABEL: flat_system_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX10-WGP-LABEL: flat_system_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX10-CU-LABEL: flat_system_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -339,7 +369,7 @@ define amdgpu_kernel void @flat_system_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in monotonic, align 4
store i32 %val, ptr %out
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_load(
; GFX7-LABEL: flat_system_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX10-WGP-LABEL: flat_system_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -380,6 +417,10 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX10-CU-LABEL: flat_system_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -410,6 +451,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -424,6 +467,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -523,7 +568,7 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in acquire, align 4
store i32 %val, ptr %out
@@ -533,6 +578,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX7-LABEL: flat_system_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -549,6 +597,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -567,6 +619,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX10-CU-LABEL: flat_system_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -600,6 +656,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -615,6 +673,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -733,7 +793,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in seq_cst, align 4
store i32 %val, ptr %out
@@ -743,6 +803,9 @@ entry:
define amdgpu_kernel void @flat_system_unordered_store(
; GFX7-LABEL: flat_system_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -754,6 +817,10 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX10-WGP-LABEL: flat_system_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -765,6 +832,10 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX10-CU-LABEL: flat_system_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -787,6 +858,8 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -797,6 +870,8 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -868,7 +943,7 @@ define amdgpu_kernel void @flat_system_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out unordered, align 4
ret void
@@ -877,6 +952,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_store(
; GFX7-LABEL: flat_system_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -888,6 +966,10 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX10-WGP-LABEL: flat_system_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -899,6 +981,10 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX10-CU-LABEL: flat_system_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -921,6 +1007,8 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -931,6 +1019,8 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1002,7 +1092,7 @@ define amdgpu_kernel void @flat_system_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out monotonic, align 4
ret void
@@ -1011,6 +1101,9 @@ entry:
define amdgpu_kernel void @flat_system_release_store(
; GFX7-LABEL: flat_system_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1023,6 +1116,10 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX10-WGP-LABEL: flat_system_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1036,6 +1133,10 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX10-CU-LABEL: flat_system_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1061,6 +1162,8 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1073,6 +1176,8 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1164,7 +1269,7 @@ define amdgpu_kernel void @flat_system_release_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out release, align 4
ret void
@@ -1173,6 +1278,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX7-LABEL: flat_system_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1185,6 +1293,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1198,6 +1310,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX10-CU-LABEL: flat_system_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1223,6 +1339,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1235,6 +1353,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1326,7 +1446,7 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out seq_cst, align 4
ret void
@@ -1335,6 +1455,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX7-LABEL: flat_system_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1346,6 +1469,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1357,6 +1484,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1379,6 +1510,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1389,6 +1522,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1460,7 +1595,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in monotonic
ret void
@@ -1469,6 +1604,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX7-LABEL: flat_system_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1482,6 +1620,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1497,6 +1639,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1524,6 +1670,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1537,6 +1685,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1627,7 +1777,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
ret void
@@ -1636,6 +1786,9 @@ entry:
define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX7-LABEL: flat_system_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1648,6 +1801,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1661,6 +1818,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1686,6 +1847,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1698,6 +1861,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1789,7 +1954,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in release
ret void
@@ -1798,6 +1963,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX7-LABEL: flat_system_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1812,6 +1980,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1829,6 +2001,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1859,6 +2035,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1874,6 +2052,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1984,7 +2164,7 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
ret void
@@ -1993,6 +2173,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX7-LABEL: flat_system_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2007,6 +2190,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2024,6 +2211,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2054,6 +2245,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2069,6 +2262,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2179,7 +2374,7 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
ret void
@@ -2188,6 +2383,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2204,6 +2402,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2221,6 +2423,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2253,6 +2459,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2268,6 +2476,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2374,7 +2584,7 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
store i32 %val, ptr %out, align 4
@@ -2384,6 +2594,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2401,6 +2614,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2420,6 +2637,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2455,6 +2676,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2472,6 +2695,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2602,7 +2827,7 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
store i32 %val, ptr %out, align 4
@@ -2612,6 +2837,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2629,6 +2857,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2648,6 +2880,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2683,6 +2919,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2700,6 +2938,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2830,7 +3070,7 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
store i32 %val, ptr %out, align 4
@@ -2840,6 +3080,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2865,6 +3108,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2890,6 +3137,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2940,6 +3191,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2954,6 +3207,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3053,7 +3308,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic monotonic
@@ -3063,6 +3318,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3090,6 +3348,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3119,6 +3381,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3174,6 +3440,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3191,6 +3459,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3309,7 +3579,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire monotonic
@@ -3319,6 +3589,9 @@ entry:
define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3345,6 +3618,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3372,6 +3649,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3425,6 +3706,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3441,6 +3724,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3560,7 +3845,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release monotonic
@@ -3570,6 +3855,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3598,6 +3886,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3629,6 +3921,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3687,6 +3983,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3706,6 +4004,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3844,7 +4144,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel monotonic
@@ -3854,6 +4154,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3882,6 +4185,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3913,6 +4220,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3971,6 +4282,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3990,6 +4303,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4128,7 +4443,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst monotonic
@@ -4138,6 +4453,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4165,6 +4483,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4194,6 +4516,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4249,6 +4575,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4266,6 +4594,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4384,7 +4714,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic acquire
@@ -4394,6 +4724,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4421,6 +4754,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4450,6 +4787,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4505,6 +4846,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4522,6 +4865,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4640,7 +4985,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire acquire
@@ -4650,6 +4995,9 @@ entry:
define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX7-LABEL: flat_system_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4678,6 +5026,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4709,6 +5061,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4767,6 +5123,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4786,6 +5144,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4924,7 +5284,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release acquire
@@ -4934,6 +5294,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4962,6 +5325,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4993,6 +5360,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5051,6 +5422,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5070,6 +5443,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5208,7 +5583,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel acquire
@@ -5218,6 +5593,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5246,6 +5624,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5277,6 +5659,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5335,6 +5721,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5354,6 +5742,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5492,7 +5882,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst acquire
@@ -5502,6 +5892,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5530,6 +5923,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5561,6 +5958,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5619,6 +6020,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5638,6 +6041,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5776,7 +6181,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic seq_cst
@@ -5786,6 +6191,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5814,6 +6222,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5845,6 +6257,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5903,6 +6319,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5922,6 +6340,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6060,7 +6480,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire seq_cst
@@ -6070,6 +6490,9 @@ entry:
define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6098,6 +6521,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6129,6 +6556,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6187,6 +6618,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6206,6 +6639,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6344,7 +6779,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release seq_cst
@@ -6354,6 +6789,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6382,6 +6820,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6413,6 +6855,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6471,6 +6917,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6490,6 +6938,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6628,7 +7078,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel seq_cst
@@ -6638,6 +7088,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6666,6 +7119,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6697,6 +7154,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6755,6 +7216,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6774,6 +7237,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6912,7 +7377,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -6922,6 +7387,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6951,6 +7419,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6980,6 +7452,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7038,6 +7514,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7055,6 +7533,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7179,7 +7659,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic monotonic
@@ -7191,6 +7671,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7221,6 +7704,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7252,6 +7739,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7312,6 +7803,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7331,6 +7824,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7465,7 +7960,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire monotonic
@@ -7477,6 +7972,9 @@ entry:
define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7507,6 +8005,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7538,6 +8040,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7599,6 +8105,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7618,6 +8126,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7762,7 +8272,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release monotonic
@@ -7774,6 +8284,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7805,6 +8318,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7838,6 +8355,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7901,6 +8422,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7922,6 +8445,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8080,7 +8605,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel monotonic
@@ -8092,6 +8617,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8123,6 +8651,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8156,6 +8688,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8219,6 +8755,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8240,6 +8778,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8398,7 +8938,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst monotonic
@@ -8410,6 +8950,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8440,6 +8983,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8471,6 +9018,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8531,6 +9082,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8550,6 +9103,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8688,7 +9243,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic acquire
@@ -8700,6 +9255,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8730,6 +9288,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8761,6 +9323,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8821,6 +9387,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8840,6 +9408,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8974,7 +9544,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire acquire
@@ -8986,6 +9556,9 @@ entry:
define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9017,6 +9590,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9050,6 +9627,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9113,6 +9694,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9134,6 +9717,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9292,7 +9877,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release acquire
@@ -9304,6 +9889,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9335,6 +9923,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9368,6 +9960,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9431,6 +10027,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9452,6 +10050,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9610,7 +10210,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel acquire
@@ -9622,6 +10222,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9653,6 +10256,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9686,6 +10293,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9749,6 +10360,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9770,6 +10383,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9928,7 +10543,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst acquire
@@ -9940,6 +10555,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9971,6 +10589,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10004,6 +10626,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10067,6 +10693,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10088,6 +10716,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10246,7 +10876,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic seq_cst
@@ -10258,6 +10888,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10289,6 +10922,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10322,6 +10959,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10385,6 +11026,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10406,6 +11049,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10560,7 +11205,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire seq_cst
@@ -10572,6 +11217,9 @@ entry:
define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10603,6 +11251,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10636,6 +11288,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10699,6 +11355,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10720,6 +11378,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10878,7 +11538,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release seq_cst
@@ -10890,6 +11550,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10921,6 +11584,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10954,6 +11621,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11017,6 +11688,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11038,6 +11711,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11196,7 +11871,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel seq_cst
@@ -11208,6 +11883,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11239,6 +11917,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11272,6 +11954,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11335,6 +12021,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11356,6 +12044,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11514,7 +12204,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -11526,6 +12216,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX7-LABEL: flat_system_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11540,6 +12233,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11554,6 +12251,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_system_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11582,6 +12283,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11594,6 +12297,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11683,7 +12388,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") unordered, align 4
store i32 %val, ptr %out
@@ -11693,6 +12398,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX7-LABEL: flat_system_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11707,6 +12415,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11721,6 +12433,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11749,6 +12465,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11761,6 +12479,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11850,7 +12570,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") monotonic, align 4
store i32 %val, ptr %out
@@ -11860,6 +12580,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX7-LABEL: flat_system_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11876,6 +12599,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11893,6 +12620,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11925,6 +12656,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11940,6 +12673,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12044,7 +12779,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") acquire, align 4
store i32 %val, ptr %out
@@ -12054,6 +12789,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX7-LABEL: flat_system_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12071,6 +12809,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12090,6 +12832,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12125,6 +12871,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12141,6 +12889,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12264,7 +13014,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4
store i32 %val, ptr %out
@@ -12274,6 +13024,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX7-LABEL: flat_system_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12285,6 +13038,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12296,6 +13053,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_system_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12318,6 +13079,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12328,6 +13091,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12399,7 +13164,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4
ret void
@@ -12408,6 +13173,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX7-LABEL: flat_system_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12419,6 +13187,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12430,6 +13202,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12452,6 +13228,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12462,6 +13240,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12533,7 +13313,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4
ret void
@@ -12542,6 +13322,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX7-LABEL: flat_system_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12554,6 +13337,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12567,6 +13354,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX10-CU-LABEL: flat_system_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12592,6 +13383,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12604,6 +13397,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12695,7 +13490,7 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") release, align 4
ret void
@@ -12704,6 +13499,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX7-LABEL: flat_system_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12716,6 +13514,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12729,6 +13531,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12754,6 +13560,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12766,6 +13574,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12857,7 +13667,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4
ret void
@@ -12866,6 +13676,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12877,6 +13690,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12888,6 +13705,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12910,6 +13731,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12920,6 +13743,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12991,7 +13816,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic
ret void
@@ -13000,6 +13825,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13013,6 +13841,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13027,6 +13859,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13053,6 +13889,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13066,6 +13904,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13154,7 +13994,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
ret void
@@ -13163,6 +14003,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX7-LABEL: flat_system_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13175,6 +14018,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13188,6 +14035,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13213,6 +14064,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13225,6 +14078,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13316,7 +14171,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release
ret void
@@ -13325,6 +14180,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13339,6 +14197,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13355,6 +14217,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13384,6 +14250,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13399,6 +14267,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13507,7 +14377,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
ret void
@@ -13516,6 +14386,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13530,6 +14403,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13546,6 +14423,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13575,6 +14456,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13590,6 +14473,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13698,7 +14583,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
ret void
@@ -13707,6 +14592,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13724,6 +14612,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13742,6 +14634,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13776,6 +14672,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13792,6 +14690,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13903,7 +14803,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
store i32 %val, ptr %out, align 4
@@ -13913,6 +14813,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13931,6 +14834,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13951,6 +14858,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13988,6 +14899,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14006,6 +14919,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14141,7 +15056,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
store i32 %val, ptr %out, align 4
@@ -14151,6 +15066,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14169,6 +15087,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14189,6 +15111,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14226,6 +15152,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14244,6 +15172,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14379,7 +15309,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
store i32 %val, ptr %out, align 4
@@ -14389,6 +15319,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14414,6 +15347,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14439,6 +15376,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14489,6 +15430,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14503,6 +15446,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14602,7 +15547,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
@@ -14612,6 +15557,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14639,6 +15587,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14667,6 +15619,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14721,6 +15677,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14738,6 +15696,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14854,7 +15814,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
@@ -14864,6 +15824,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14890,6 +15853,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14917,6 +15884,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14970,6 +15941,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14986,6 +15959,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15105,7 +16080,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
@@ -15115,6 +16090,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15143,6 +16121,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15173,6 +16155,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15230,6 +16216,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15249,6 +16237,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15385,7 +16375,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
@@ -15395,6 +16385,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15423,6 +16416,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15453,6 +16450,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15510,6 +16511,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15529,6 +16532,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15665,7 +16670,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
@@ -15675,6 +16680,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15702,6 +16710,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15730,6 +16742,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15784,6 +16800,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15801,6 +16819,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15917,7 +16937,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
@@ -15927,6 +16947,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15954,6 +16977,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15982,6 +17009,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16036,6 +17067,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16053,6 +17086,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16169,7 +17204,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
@@ -16179,6 +17214,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16207,6 +17245,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16237,6 +17279,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16294,6 +17340,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16313,6 +17361,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16449,7 +17499,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release acquire
@@ -16459,6 +17509,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16487,6 +17540,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16517,6 +17574,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16574,6 +17635,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16593,6 +17656,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16729,7 +17794,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
@@ -16739,6 +17804,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16767,6 +17835,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16797,6 +17869,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16854,6 +17930,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16873,6 +17951,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17009,7 +18089,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
@@ -17019,6 +18099,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17047,6 +18130,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17077,6 +18164,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17134,6 +18225,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17153,6 +18246,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17289,7 +18384,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
@@ -17299,6 +18394,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17327,6 +18425,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17357,6 +18459,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17414,6 +18520,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17433,6 +18541,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17569,7 +18679,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
@@ -17579,6 +18689,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17607,6 +18720,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17637,6 +18754,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17694,6 +18815,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17713,6 +18836,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17849,7 +18974,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
@@ -17859,6 +18984,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17887,6 +19015,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17917,6 +19049,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17974,6 +19110,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17993,6 +19131,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18129,7 +19269,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
@@ -18139,6 +19279,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18167,6 +19310,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18197,6 +19344,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18254,6 +19405,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18273,6 +19426,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18409,7 +19564,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
@@ -18419,6 +19574,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18448,6 +19606,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18477,6 +19639,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18535,6 +19701,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18552,6 +19720,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18676,7 +19846,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
@@ -18688,6 +19858,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18719,6 +19892,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18751,6 +19928,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18813,6 +19994,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18833,6 +20016,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18972,7 +20157,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
@@ -18984,6 +20169,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19014,6 +20202,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19045,6 +20237,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19106,6 +20302,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19125,6 +20323,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19269,7 +20469,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
@@ -19281,6 +20481,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19313,6 +20516,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19347,6 +20554,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19412,6 +20623,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19434,6 +20647,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19597,7 +20812,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
@@ -19609,6 +20824,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19641,6 +20859,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19675,6 +20897,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19740,6 +20966,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19762,6 +20990,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19925,7 +21155,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
@@ -19937,6 +21167,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19968,6 +21201,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20000,6 +21237,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20062,6 +21303,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20082,6 +21325,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20225,7 +21470,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
@@ -20237,6 +21482,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20268,6 +21516,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20300,6 +21552,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20362,6 +21618,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20382,6 +21640,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20521,7 +21781,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
@@ -20533,6 +21793,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20565,6 +21828,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20599,6 +21866,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20664,6 +21935,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20686,6 +21959,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20849,7 +22124,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release acquire
@@ -20861,6 +22136,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20893,6 +22171,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20927,6 +22209,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20992,6 +22278,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21014,6 +22302,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21177,7 +22467,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
@@ -21189,6 +22479,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21221,6 +22514,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21255,6 +22552,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21320,6 +22621,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21342,6 +22645,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21505,7 +22810,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
@@ -21517,6 +22822,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21549,6 +22857,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21583,6 +22895,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21648,6 +22964,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21670,6 +22988,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21833,7 +23153,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
@@ -21845,6 +23165,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21877,6 +23200,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21911,6 +23238,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21976,6 +23307,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21998,6 +23331,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22157,7 +23492,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
@@ -22169,6 +23504,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22201,6 +23539,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22235,6 +23577,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22300,6 +23646,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22322,6 +23670,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22485,7 +23835,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
@@ -22497,6 +23847,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22529,6 +23882,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22563,6 +23920,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22628,6 +23989,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22650,6 +24013,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22813,7 +24178,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
@@ -22825,6 +24190,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22857,6 +24225,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22891,6 +24263,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22956,6 +24332,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22978,6 +24356,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -23141,7 +24521,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
@@ -23149,5 +24529,3 @@ entry:
store i32 %val0, ptr %out, align 4
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index ed4292454913e..a88e0e217fdb4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -11,6 +11,9 @@
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -26,6 +29,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -41,6 +48,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-CU-LABEL: flat_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -132,7 +143,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load volatile i32, ptr %in, align 4
store i32 %val, ptr %out
@@ -142,6 +153,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX7-LABEL: flat_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -172,6 +186,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
@@ -202,6 +220,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-CU-LABEL: flat_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
@@ -393,7 +415,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr %in, i32 %tid
@@ -405,6 +427,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX7-LABEL: flat_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -420,6 +445,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -435,6 +464,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-CU-LABEL: flat_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -530,7 +563,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load i32, ptr %in, align 4
store volatile i32 %val, ptr %out
@@ -540,6 +573,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX7-LABEL: flat_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -569,6 +605,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -597,6 +637,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-CU-LABEL: flat_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -787,7 +831,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr %in, align 4
@@ -799,6 +843,9 @@ entry:
define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX7-LABEL: flat_volatile_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -814,6 +861,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
;
; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -829,6 +880,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
;
; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -916,7 +971,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic volatile i32, ptr %in syncscope("workgroup") acquire, align 4
store i32 %val, ptr %out
@@ -926,6 +981,9 @@ entry:
define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX7-LABEL: flat_volatile_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -938,6 +996,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
;
; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -951,6 +1013,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
;
; GFX10-CU-LABEL: flat_volatile_workgroup_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1024,12 +1090,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index 34911b17657bb..7c637a20ab47b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX7-LABEL: flat_wavefront_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX10-WGP-LABEL: flat_wavefront_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX10-CU-LABEL: flat_wavefront_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -172,7 +187,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront") unordered, align 4
store i32 %val, ptr %out
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX7-LABEL: flat_wavefront_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -339,7 +369,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront") monotonic, align 4
store i32 %val, ptr %out
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX7-LABEL: flat_wavefront_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -363,6 +396,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -377,6 +414,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -405,6 +446,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -417,6 +460,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -506,7 +551,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront") acquire, align 4
store i32 %val, ptr %out
@@ -516,6 +561,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX7-LABEL: flat_wavefront_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -530,6 +578,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -544,6 +596,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -572,6 +628,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -584,6 +642,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -673,7 +733,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront") seq_cst, align 4
store i32 %val, ptr %out
@@ -683,6 +743,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX7-LABEL: flat_wavefront_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -694,6 +757,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX10-WGP-LABEL: flat_wavefront_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -705,6 +772,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX10-CU-LABEL: flat_wavefront_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -727,6 +798,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -737,6 +810,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -808,7 +883,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4
ret void
@@ -817,6 +892,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX7-LABEL: flat_wavefront_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -828,6 +906,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -839,6 +921,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -861,6 +947,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -871,6 +959,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -942,7 +1032,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4
ret void
@@ -951,6 +1041,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_store(
; GFX7-LABEL: flat_wavefront_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -962,6 +1055,10 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX10-WGP-LABEL: flat_wavefront_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -973,6 +1070,10 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX10-CU-LABEL: flat_wavefront_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -995,6 +1096,8 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1005,6 +1108,8 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1076,7 +1181,7 @@ define amdgpu_kernel void @flat_wavefront_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4
ret void
@@ -1085,6 +1190,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX7-LABEL: flat_wavefront_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1096,6 +1204,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1107,6 +1219,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1129,6 +1245,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1139,6 +1257,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1210,7 +1330,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4
ret void
@@ -1219,6 +1339,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1230,6 +1353,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1241,6 +1368,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1263,6 +1394,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1273,6 +1406,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1344,7 +1479,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic
ret void
@@ -1353,6 +1488,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX7-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1364,6 +1502,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1375,6 +1517,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1397,6 +1543,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1407,6 +1555,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1478,7 +1628,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
ret void
@@ -1487,6 +1637,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX7-LABEL: flat_wavefront_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1498,6 +1651,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1509,6 +1666,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1531,6 +1692,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1541,6 +1704,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1612,7 +1777,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release
ret void
@@ -1621,6 +1786,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1632,6 +1800,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1643,6 +1815,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1665,6 +1841,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1675,6 +1853,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1746,7 +1926,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
ret void
@@ -1755,6 +1935,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1766,6 +1949,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1777,6 +1964,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1799,6 +1990,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1809,6 +2002,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1880,7 +2075,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
ret void
@@ -1889,6 +2084,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1904,6 +2102,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1919,6 +2121,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1949,6 +2155,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1962,6 +2170,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2058,7 +2268,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
store i32 %val, ptr %out, align 4
@@ -2068,6 +2278,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2083,6 +2296,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2098,6 +2315,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2128,6 +2349,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2141,6 +2364,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2237,7 +2462,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
store i32 %val, ptr %out, align 4
@@ -2247,6 +2472,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2262,6 +2490,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2277,6 +2509,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2307,6 +2543,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2320,6 +2558,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2416,7 +2656,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
store i32 %val, ptr %out, align 4
@@ -2426,6 +2666,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2451,6 +2694,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2476,6 +2723,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2526,6 +2777,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2540,6 +2793,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2639,7 +2894,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
@@ -2649,6 +2904,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2674,6 +2932,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2699,6 +2961,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2749,6 +3015,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2763,6 +3031,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2862,7 +3132,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
@@ -2872,6 +3142,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2897,6 +3170,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2922,6 +3199,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2972,6 +3253,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2986,6 +3269,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3085,7 +3370,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
@@ -3095,6 +3380,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3120,6 +3408,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3145,6 +3437,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3195,6 +3491,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3209,6 +3507,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3308,7 +3608,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
@@ -3318,6 +3618,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3343,6 +3646,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3368,6 +3675,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3418,6 +3729,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3432,6 +3745,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3531,7 +3846,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
@@ -3541,6 +3856,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3566,6 +3884,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3591,6 +3913,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3641,6 +3967,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3655,6 +3983,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3754,7 +4084,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
@@ -3764,6 +4094,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3789,6 +4122,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3814,6 +4151,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3864,6 +4205,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3878,6 +4221,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3977,7 +4322,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
@@ -3987,6 +4332,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4012,6 +4360,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4037,6 +4389,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4087,6 +4443,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4101,6 +4459,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4200,7 +4560,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
@@ -4210,6 +4570,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4235,6 +4598,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4260,6 +4627,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4310,6 +4681,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4324,6 +4697,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4423,7 +4798,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
@@ -4433,6 +4808,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4458,6 +4836,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4483,6 +4865,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4533,6 +4919,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4547,6 +4935,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4646,7 +5036,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
@@ -4656,6 +5046,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4681,6 +5074,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4706,6 +5103,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4756,6 +5157,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4770,6 +5173,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4869,7 +5274,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
@@ -4879,6 +5284,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4904,6 +5312,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4929,6 +5341,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4979,6 +5395,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4993,6 +5411,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5092,7 +5512,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
@@ -5102,6 +5522,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5127,6 +5550,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5152,6 +5579,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5202,6 +5633,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5216,6 +5649,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5315,7 +5750,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
@@ -5325,6 +5760,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5350,6 +5788,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5375,6 +5817,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5425,6 +5871,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5439,6 +5887,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5538,7 +5988,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
@@ -5548,6 +5998,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5573,6 +6026,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5598,6 +6055,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5648,6 +6109,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5662,6 +6125,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5761,7 +6226,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
@@ -5771,6 +6236,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -5800,6 +6268,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5829,6 +6301,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5887,6 +6363,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5904,6 +6382,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6028,7 +6508,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
@@ -6040,6 +6520,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6069,6 +6552,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6098,6 +6585,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6156,6 +6647,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6173,6 +6666,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6297,7 +6792,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
@@ -6309,6 +6804,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6338,6 +6836,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6367,6 +6869,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6425,6 +6931,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6442,6 +6950,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6566,7 +7076,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
@@ -6578,6 +7088,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6607,6 +7120,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6636,6 +7153,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6694,6 +7215,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6711,6 +7234,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6835,7 +7360,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
@@ -6847,6 +7372,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6876,6 +7404,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6905,6 +7437,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6963,6 +7499,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6980,6 +7518,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7104,7 +7644,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
@@ -7116,6 +7656,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7145,6 +7688,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7174,6 +7721,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7232,6 +7783,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7249,6 +7802,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7373,7 +7928,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
@@ -7385,6 +7940,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7414,6 +7972,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7443,6 +8005,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7501,6 +8067,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7518,6 +8086,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7642,7 +8212,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
@@ -7654,6 +8224,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7683,6 +8256,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7712,6 +8289,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7770,6 +8351,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7787,6 +8370,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7911,7 +8496,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
@@ -7923,6 +8508,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7952,6 +8540,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7981,6 +8573,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8039,6 +8635,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8056,6 +8654,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8180,7 +8780,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
@@ -8192,6 +8792,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8221,6 +8824,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8250,6 +8857,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8308,6 +8919,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8325,6 +8938,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8449,7 +9064,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
@@ -8461,6 +9076,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8490,6 +9108,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8519,6 +9141,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8577,6 +9203,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8594,6 +9222,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8718,7 +9348,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
@@ -8730,6 +9360,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8759,6 +9392,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8788,6 +9425,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8846,6 +9487,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8863,6 +9506,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8987,7 +9632,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
@@ -8999,6 +9644,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9028,6 +9676,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9057,6 +9709,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9115,6 +9771,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9132,6 +9790,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9256,7 +9916,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
@@ -9268,6 +9928,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9297,6 +9960,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9326,6 +9993,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9384,6 +10055,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9401,6 +10074,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9525,7 +10200,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
@@ -9537,6 +10212,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9566,6 +10244,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9595,6 +10277,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9653,6 +10339,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9670,6 +10358,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9794,7 +10484,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
@@ -9806,6 +10496,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
; GFX7-LABEL: flat_wavefront_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9820,6 +10513,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -9834,6 +10531,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -9862,6 +10563,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9874,6 +10577,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9963,7 +10668,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront-one-as") unordered, align 4
store i32 %val, ptr %out
@@ -9973,6 +10678,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9987,6 +10695,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10001,6 +10713,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10029,6 +10745,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10041,6 +10759,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10130,7 +10850,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront-one-as") monotonic, align 4
store i32 %val, ptr %out
@@ -10140,6 +10860,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
; GFX7-LABEL: flat_wavefront_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10154,6 +10877,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10168,6 +10895,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10196,6 +10927,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10208,6 +10941,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10297,7 +11032,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront-one-as") acquire, align 4
store i32 %val, ptr %out
@@ -10307,6 +11042,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10321,6 +11059,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10335,6 +11077,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10363,6 +11109,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10375,6 +11123,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10464,7 +11214,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront-one-as") seq_cst, align 4
store i32 %val, ptr %out
@@ -10474,6 +11224,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
; GFX7-LABEL: flat_wavefront_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10485,6 +11238,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10496,6 +11253,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10518,6 +11279,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10528,6 +11291,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10599,7 +11364,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4
ret void
@@ -10608,6 +11373,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10619,6 +11387,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10630,6 +11402,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10652,6 +11428,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10662,6 +11440,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10733,7 +11513,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4
ret void
@@ -10742,6 +11522,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_store(
; GFX7-LABEL: flat_wavefront_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10753,6 +11536,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10764,6 +11551,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10786,6 +11577,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10796,6 +11589,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10867,7 +11662,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4
ret void
@@ -10876,6 +11671,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10887,6 +11685,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10898,6 +11700,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10920,6 +11726,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10930,6 +11738,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11001,7 +11811,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4
ret void
@@ -11010,6 +11820,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11021,6 +11834,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11032,6 +11849,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11054,6 +11875,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11064,6 +11887,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11135,7 +11960,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic
ret void
@@ -11144,6 +11969,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11155,6 +11983,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11166,6 +11998,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11188,6 +12024,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11198,6 +12036,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11269,7 +12109,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
ret void
@@ -11278,6 +12118,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11289,6 +12132,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11300,6 +12147,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11322,6 +12173,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11332,6 +12185,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11403,7 +12258,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release
ret void
@@ -11412,6 +12267,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11423,6 +12281,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11434,6 +12296,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11456,6 +12322,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11466,6 +12334,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11537,7 +12407,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
ret void
@@ -11546,6 +12416,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11557,6 +12430,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11568,6 +12445,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11590,6 +12471,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11600,6 +12483,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11671,7 +12556,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
ret void
@@ -11680,6 +12565,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11695,6 +12583,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11710,6 +12602,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11740,6 +12636,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11753,6 +12651,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11849,7 +12749,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
store i32 %val, ptr %out, align 4
@@ -11859,6 +12759,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11874,6 +12777,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11889,6 +12796,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11919,6 +12830,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11932,6 +12845,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12028,7 +12943,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
store i32 %val, ptr %out, align 4
@@ -12038,6 +12953,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12053,6 +12971,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12068,6 +12990,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12098,6 +13024,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12111,6 +13039,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12207,7 +13137,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
store i32 %val, ptr %out, align 4
@@ -12217,6 +13147,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12242,6 +13175,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12267,6 +13204,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12317,6 +13258,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12331,6 +13274,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12430,7 +13375,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
@@ -12440,6 +13385,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12465,6 +13413,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12490,6 +13442,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12540,6 +13496,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12554,6 +13512,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12653,7 +13613,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
@@ -12663,6 +13623,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12688,6 +13651,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12713,6 +13680,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12763,6 +13734,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12777,6 +13750,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12876,7 +13851,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
@@ -12886,6 +13861,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12911,6 +13889,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12936,6 +13918,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12986,6 +13972,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13000,6 +13988,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13099,7 +14089,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
@@ -13109,6 +14099,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13134,6 +14127,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13159,6 +14156,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13209,6 +14210,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13223,6 +14226,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13322,7 +14327,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
@@ -13332,6 +14337,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13357,6 +14365,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13382,6 +14394,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13432,6 +14448,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13446,6 +14464,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13545,7 +14565,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
@@ -13555,6 +14575,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13580,6 +14603,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13605,6 +14632,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13655,6 +14686,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13669,6 +14702,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13768,7 +14803,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
@@ -13778,6 +14813,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13803,6 +14841,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13828,6 +14870,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13878,6 +14924,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13892,6 +14940,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13991,7 +15041,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
@@ -14001,6 +15051,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14026,6 +15079,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14051,6 +15108,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14101,6 +15162,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14115,6 +15178,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14214,7 +15279,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
@@ -14224,6 +15289,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14249,6 +15317,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14274,6 +15346,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14324,6 +15400,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14338,6 +15416,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14437,7 +15517,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
@@ -14447,6 +15527,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14472,6 +15555,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14497,6 +15584,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14547,6 +15638,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14561,6 +15654,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14660,7 +15755,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
@@ -14670,6 +15765,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14695,6 +15793,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14720,6 +15822,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14770,6 +15876,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14784,6 +15892,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14883,7 +15993,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
@@ -14893,6 +16003,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14918,6 +16031,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14943,6 +16060,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14993,6 +16114,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15007,6 +16130,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15106,7 +16231,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
@@ -15116,6 +16241,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15141,6 +16269,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15166,6 +16298,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15216,6 +16352,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15230,6 +16368,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15329,7 +16469,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
@@ -15339,6 +16479,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15364,6 +16507,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15389,6 +16536,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15439,6 +16590,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15453,6 +16606,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15552,7 +16707,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
@@ -15562,6 +16717,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15591,6 +16749,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15620,6 +16782,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15678,6 +16844,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15695,6 +16863,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15819,7 +16989,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
@@ -15831,6 +17001,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15860,6 +17033,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15889,6 +17066,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15947,6 +17128,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15964,6 +17147,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16088,7 +17273,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
@@ -16100,6 +17285,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16129,6 +17317,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16158,6 +17350,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16216,6 +17412,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16233,6 +17431,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16357,7 +17557,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
@@ -16369,6 +17569,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16398,6 +17601,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16427,6 +17634,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16485,6 +17696,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16502,6 +17715,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16626,7 +17841,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
@@ -16638,6 +17853,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16667,6 +17885,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16696,6 +17918,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16754,6 +17980,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16771,6 +17999,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16895,7 +18125,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
@@ -16907,6 +18137,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16936,6 +18169,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16965,6 +18202,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17023,6 +18264,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17040,6 +18283,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17164,7 +18409,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
@@ -17176,6 +18421,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17205,6 +18453,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17234,6 +18486,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17292,6 +18548,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17309,6 +18567,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17433,7 +18693,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
@@ -17445,6 +18705,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17474,6 +18737,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17503,6 +18770,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17561,6 +18832,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17578,6 +18851,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17702,7 +18977,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
@@ -17714,6 +18989,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17743,6 +19021,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17772,6 +19054,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17830,6 +19116,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17847,6 +19135,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17971,7 +19261,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
@@ -17983,6 +19273,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18012,6 +19305,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18041,6 +19338,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18099,6 +19400,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18116,6 +19419,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18240,7 +19545,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
@@ -18252,6 +19557,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18281,6 +19589,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18310,6 +19622,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18368,6 +19684,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18385,6 +19703,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18509,7 +19829,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
@@ -18521,6 +19841,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18550,6 +19873,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18579,6 +19906,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18637,6 +19968,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18654,6 +19987,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18778,7 +20113,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
@@ -18790,6 +20125,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18819,6 +20157,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18848,6 +20190,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18906,6 +20252,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18923,6 +20271,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19047,7 +20397,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
@@ -19059,6 +20409,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19088,6 +20441,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19117,6 +20474,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19175,6 +20536,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19192,6 +20555,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19316,7 +20681,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
@@ -19324,5 +20689,3 @@ entry:
store i32 %val0, ptr %out, align 4
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 915135224b982..0fd4aa4a7a93f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX7-LABEL: flat_workgroup_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX10-WGP-LABEL: flat_workgroup_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX10-CU-LABEL: flat_workgroup_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -172,7 +187,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") unordered, align 4
store i32 %val, ptr %out
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX7-LABEL: flat_workgroup_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -339,7 +369,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") monotonic, align 4
store i32 %val, ptr %out
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX7-LABEL: flat_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -379,6 +416,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -409,6 +450,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -422,6 +465,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -518,7 +563,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") acquire, align 4
store i32 %val, ptr %out
@@ -528,6 +573,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX7-LABEL: flat_workgroup_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -544,6 +592,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -561,6 +613,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -593,6 +649,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -607,6 +665,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -716,7 +776,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4
store i32 %val, ptr %out
@@ -726,6 +786,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX7-LABEL: flat_workgroup_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -737,6 +800,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX10-WGP-LABEL: flat_workgroup_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -748,6 +815,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX10-CU-LABEL: flat_workgroup_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -770,6 +841,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -780,6 +853,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -851,7 +926,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4
ret void
@@ -860,6 +935,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX7-LABEL: flat_workgroup_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -871,6 +949,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -882,6 +964,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -904,6 +990,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -914,6 +1002,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -985,7 +1075,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4
ret void
@@ -994,6 +1084,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_store(
; GFX7-LABEL: flat_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1006,6 +1099,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX10-WGP-LABEL: flat_workgroup_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1019,6 +1116,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX10-CU-LABEL: flat_workgroup_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1043,6 +1144,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1054,6 +1157,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1136,7 +1241,7 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4
ret void
@@ -1145,6 +1250,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX7-LABEL: flat_workgroup_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1157,6 +1265,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1170,6 +1282,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1194,6 +1310,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1205,6 +1323,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1287,7 +1407,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4
ret void
@@ -1296,6 +1416,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1307,6 +1430,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1318,6 +1445,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1340,6 +1471,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1350,6 +1483,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1421,7 +1556,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic
ret void
@@ -1430,6 +1565,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX7-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1442,6 +1580,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1456,6 +1598,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1480,6 +1626,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1491,6 +1639,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1574,7 +1724,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
ret void
@@ -1583,6 +1733,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX7-LABEL: flat_workgroup_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1595,6 +1748,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1608,6 +1765,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1632,6 +1793,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1643,6 +1806,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1725,7 +1890,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release
ret void
@@ -1734,6 +1899,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1747,6 +1915,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1763,6 +1935,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1789,6 +1965,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1801,6 +1979,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1895,7 +2075,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
ret void
@@ -1904,6 +2084,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1917,6 +2100,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1933,6 +2120,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1959,6 +2150,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1971,6 +2164,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2065,7 +2260,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
ret void
@@ -2074,6 +2269,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2090,6 +2288,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2106,6 +2308,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2138,6 +2344,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2152,6 +2360,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2255,7 +2465,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
store i32 %val, ptr %out, align 4
@@ -2265,6 +2475,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2282,6 +2495,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2300,6 +2517,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2334,6 +2555,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2349,6 +2572,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2465,7 +2690,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
store i32 %val, ptr %out, align 4
@@ -2475,6 +2700,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2492,6 +2720,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2510,6 +2742,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2544,6 +2780,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2559,6 +2797,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2675,7 +2915,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr %out, align 4
@@ -2685,6 +2925,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2710,6 +2953,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2735,6 +2982,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2785,6 +3036,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2799,6 +3052,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2898,7 +3153,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
@@ -2908,6 +3163,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2934,6 +3192,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2962,6 +3224,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3014,6 +3280,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3029,6 +3297,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3140,7 +3410,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
@@ -3150,6 +3420,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3176,6 +3449,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3203,6 +3480,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3255,6 +3536,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3270,6 +3553,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3380,7 +3665,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
@@ -3390,6 +3675,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3417,6 +3705,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3447,6 +3739,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3501,6 +3797,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3517,6 +3815,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3639,7 +3939,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
@@ -3649,6 +3949,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3676,6 +3979,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3706,6 +4013,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3760,6 +4071,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3776,6 +4089,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3898,7 +4213,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
@@ -3908,6 +4223,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3934,6 +4252,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3962,6 +4284,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4014,6 +4340,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4029,6 +4357,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4140,7 +4470,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
@@ -4150,6 +4480,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4176,6 +4509,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4204,6 +4541,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4256,6 +4597,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4271,6 +4614,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4382,7 +4727,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
@@ -4392,6 +4737,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4419,6 +4767,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4449,6 +4801,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4503,6 +4859,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4519,6 +4877,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4641,7 +5001,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
@@ -4651,6 +5011,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4678,6 +5041,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4708,6 +5075,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4762,6 +5133,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4778,6 +5151,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4900,7 +5275,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
@@ -4910,6 +5285,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4937,6 +5315,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4967,6 +5349,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5021,6 +5407,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5037,6 +5425,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5159,7 +5549,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
@@ -5169,6 +5559,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5196,6 +5589,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5226,6 +5623,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5280,6 +5681,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5296,6 +5699,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5418,7 +5823,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
@@ -5428,6 +5833,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -5457,6 +5865,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5486,6 +5898,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5544,6 +5960,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5561,6 +5979,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5685,7 +6105,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
@@ -5697,6 +6117,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -5727,6 +6150,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5757,6 +6184,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5817,6 +6248,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5835,6 +6268,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5966,7 +6401,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
@@ -5978,6 +6413,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6008,6 +6446,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6039,6 +6481,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6099,6 +6545,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6117,6 +6565,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6252,7 +6702,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
@@ -6264,6 +6714,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6295,6 +6748,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6327,6 +6784,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6389,6 +6850,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6408,6 +6871,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6552,7 +7017,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
@@ -6564,6 +7029,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6595,6 +7063,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6627,6 +7099,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6689,6 +7165,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6708,6 +7186,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6852,7 +7332,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
@@ -6864,6 +7344,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6894,6 +7377,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6924,6 +7411,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6984,6 +7475,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7002,6 +7495,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7135,7 +7630,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
@@ -7147,6 +7642,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7177,6 +7675,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7207,6 +7709,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7267,6 +7773,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7285,6 +7793,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7416,7 +7926,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
@@ -7428,6 +7938,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7459,6 +7972,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7491,6 +8008,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7553,6 +8074,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7572,6 +8095,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7716,7 +8241,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
@@ -7728,6 +8253,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7759,6 +8287,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7791,6 +8323,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7853,6 +8389,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7872,6 +8410,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8016,7 +8556,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
@@ -8028,6 +8568,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8059,6 +8602,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8091,6 +8638,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8153,6 +8704,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8172,6 +8725,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8316,7 +8871,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
@@ -8328,6 +8883,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8359,6 +8917,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8391,6 +8953,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8453,6 +9019,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8472,6 +9040,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8616,7 +9186,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst
@@ -8628,6 +9198,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8659,6 +9232,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8691,6 +9268,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8753,6 +9334,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8772,6 +9355,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8914,7 +9499,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst
@@ -8926,6 +9511,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8957,6 +9545,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8989,6 +9581,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9051,6 +9647,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9070,6 +9668,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9214,7 +9814,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst
@@ -9226,6 +9826,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9257,6 +9860,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9289,6 +9896,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9351,6 +9962,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9370,6 +9983,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9514,7 +10129,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst
@@ -9526,6 +10141,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9557,6 +10175,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9589,6 +10211,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9651,6 +10277,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9670,6 +10298,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9814,7 +10444,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
@@ -9826,6 +10456,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9840,6 +10473,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -9854,6 +10491,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -9882,6 +10523,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9894,6 +10537,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9983,7 +10628,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") unordered, align 4
store i32 %val, ptr %out
@@ -9993,6 +10638,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10007,6 +10655,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10021,6 +10673,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10049,6 +10705,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10061,6 +10719,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10150,7 +10810,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") monotonic, align 4
store i32 %val, ptr %out
@@ -10160,6 +10820,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10174,6 +10837,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10190,6 +10857,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10218,6 +10889,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10230,6 +10903,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10325,7 +11000,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") acquire, align 4
store i32 %val, ptr %out
@@ -10335,6 +11010,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10349,6 +11027,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10367,6 +11049,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10395,6 +11081,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10407,6 +11095,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10512,7 +11202,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %in, ptr %out) #0 {
+ ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4
store i32 %val, ptr %out
@@ -10522,6 +11212,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10533,6 +11226,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10544,6 +11241,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10566,6 +11267,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10576,6 +11279,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10647,7 +11352,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4
ret void
@@ -10656,6 +11361,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10667,6 +11375,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10678,6 +11390,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10700,6 +11416,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10710,6 +11428,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10781,7 +11501,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4
ret void
@@ -10790,6 +11510,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX7-LABEL: flat_workgroup_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10801,6 +11524,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10814,6 +11541,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10836,6 +11567,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10846,6 +11579,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10925,7 +11660,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4
ret void
@@ -10934,6 +11669,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10945,6 +11683,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10958,6 +11700,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10980,6 +11726,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10990,6 +11738,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11069,7 +11819,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr %out) #0 {
+ i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4
ret void
@@ -11078,6 +11828,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11089,6 +11842,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11100,6 +11857,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11122,6 +11883,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11132,6 +11895,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11203,7 +11968,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic
ret void
@@ -11212,6 +11977,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11223,6 +11991,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11236,6 +12008,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11258,6 +12034,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11268,6 +12046,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11347,7 +12127,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
ret void
@@ -11356,6 +12136,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11367,6 +12150,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11380,6 +12167,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11402,6 +12193,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11412,6 +12205,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11491,7 +12286,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release
ret void
@@ -11500,6 +12295,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11511,6 +12309,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11526,6 +12328,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11548,6 +12354,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11558,6 +12366,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11645,7 +12455,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
ret void
@@ -11654,6 +12464,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11665,6 +12478,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11680,6 +12497,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11702,6 +12523,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11712,6 +12535,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11799,7 +12624,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
ret void
@@ -11808,6 +12633,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11823,6 +12651,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11840,6 +12672,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11870,6 +12706,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11883,6 +12721,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11985,7 +12825,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
store i32 %val, ptr %out, align 4
@@ -11995,6 +12835,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12010,6 +12853,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12029,6 +12876,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12059,6 +12910,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12072,6 +12925,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12184,7 +13039,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
store i32 %val, ptr %out, align 4
@@ -12194,6 +13049,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12209,6 +13067,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12228,6 +13090,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12258,6 +13124,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12271,6 +13139,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12383,7 +13253,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in) #0 {
+ ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
store i32 %val, ptr %out, align 4
@@ -12393,6 +13263,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12418,6 +13291,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12443,6 +13320,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12493,6 +13374,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12507,6 +13390,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12606,7 +13491,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
@@ -12616,6 +13501,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12641,6 +13529,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12668,6 +13560,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12718,6 +13614,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12732,6 +13630,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12839,7 +13739,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
@@ -12849,6 +13749,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12874,6 +13777,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12901,6 +13808,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12951,6 +13862,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12965,6 +13878,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13072,7 +13987,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
@@ -13082,6 +13997,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13107,6 +14025,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13136,6 +14058,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13186,6 +14112,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13200,6 +14128,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13315,7 +14245,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
@@ -13325,6 +14255,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13350,6 +14283,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13379,6 +14316,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13429,6 +14370,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13443,6 +14386,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13558,7 +14503,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
@@ -13568,6 +14513,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13593,6 +14541,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13620,6 +14572,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13670,6 +14626,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13684,6 +14642,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13791,7 +14751,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
@@ -13801,6 +14761,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13826,6 +14789,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13853,6 +14820,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13903,6 +14874,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13917,6 +14890,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14024,7 +14999,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
@@ -14034,6 +15009,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14059,6 +15037,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14088,6 +15070,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14138,6 +15124,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14152,6 +15140,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14267,7 +15257,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
@@ -14277,6 +15267,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14302,6 +15295,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14331,6 +15328,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14381,6 +15382,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14395,6 +15398,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14510,7 +15515,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
@@ -14520,6 +15525,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14545,6 +15553,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14574,6 +15586,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14624,6 +15640,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14638,6 +15656,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14753,7 +15773,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
@@ -14763,6 +15783,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14788,6 +15811,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14817,6 +15844,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14867,6 +15898,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14881,6 +15914,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14996,7 +16031,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
@@ -15006,6 +16041,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15031,6 +16069,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15060,6 +16102,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15110,6 +16156,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15124,6 +16172,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15239,7 +16289,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
@@ -15249,6 +16299,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15274,6 +16327,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15303,6 +16360,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15353,6 +16414,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15367,6 +16430,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15482,7 +16547,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
@@ -15492,6 +16557,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15517,6 +16585,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15546,6 +16618,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15596,6 +16672,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15610,6 +16688,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15725,7 +16805,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
@@ -15735,6 +16815,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15760,6 +16843,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15789,6 +16876,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15839,6 +16930,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15853,6 +16946,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15968,7 +17063,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
@@ -15978,6 +17073,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16007,6 +17105,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16036,6 +17138,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16094,6 +17200,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16111,6 +17219,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16235,7 +17345,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
@@ -16247,6 +17357,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16276,6 +17389,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16307,6 +17424,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16365,6 +17486,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16382,6 +17505,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16512,7 +17637,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
@@ -16524,6 +17649,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16553,6 +17681,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16584,6 +17716,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16642,6 +17778,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16659,6 +17797,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16791,7 +17931,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
@@ -16803,6 +17943,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16832,6 +17975,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16865,6 +18012,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16923,6 +18074,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16940,6 +18093,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17080,7 +18235,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
@@ -17092,6 +18247,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17121,6 +18279,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17154,6 +18316,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17212,6 +18378,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17229,6 +18397,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17369,7 +18539,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
@@ -17381,6 +18551,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17410,6 +18583,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17441,6 +18618,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17499,6 +18680,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17516,6 +18699,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17648,7 +18833,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
@@ -17660,6 +18845,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17689,6 +18877,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17720,6 +18912,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17778,6 +18974,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17795,6 +18993,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17925,7 +19125,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
@@ -17937,6 +19137,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17966,6 +19169,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17999,6 +19206,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18057,6 +19268,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18074,6 +19287,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18214,7 +19429,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
@@ -18226,6 +19441,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18255,6 +19473,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18288,6 +19510,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18346,6 +19572,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18363,6 +19591,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18503,7 +19733,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
@@ -18515,6 +19745,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18544,6 +19777,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18577,6 +19814,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18635,6 +19876,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18652,6 +19895,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18792,7 +20037,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
@@ -18804,6 +20049,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18833,6 +20081,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18866,6 +20118,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18924,6 +20180,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18941,6 +20199,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19081,7 +20341,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
@@ -19093,6 +20353,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19122,6 +20385,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19155,6 +20422,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19213,6 +20484,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19230,6 +20503,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19368,7 +20643,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
@@ -19380,6 +20655,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19409,6 +20687,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19442,6 +20724,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19500,6 +20786,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19517,6 +20805,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19657,7 +20947,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
@@ -19669,6 +20959,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19698,6 +20991,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19731,6 +21028,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19789,6 +21090,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19806,6 +21109,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19946,7 +21251,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
@@ -19958,6 +21263,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19987,6 +21295,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20020,6 +21332,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20078,6 +21394,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20095,6 +21413,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20235,7 +21555,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
- ptr %out, i32 %in, i32 %old) #0 {
+ ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
@@ -20243,5 +21563,3 @@ entry:
store i32 %val0, ptr %out, align 4
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 9e6226516f0b8..8b600c835a160 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_agent_unordered_load(
;
; GFX7-LABEL: global_agent_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -187,7 +190,7 @@ define amdgpu_kernel void @global_agent_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_agent_monotonic_load(
;
; GFX7-LABEL: global_agent_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -368,7 +374,7 @@ define amdgpu_kernel void @global_agent_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -404,6 +410,9 @@ define amdgpu_kernel void @global_agent_acquire_load(
;
; GFX7-LABEL: global_agent_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -565,7 +574,7 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -602,6 +611,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
;
; GFX7-LABEL: global_agent_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -781,7 +793,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -809,6 +821,9 @@ define amdgpu_kernel void @global_agent_unordered_store(
;
; GFX7-LABEL: global_agent_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -935,7 +950,7 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4
ret void
@@ -962,6 +977,9 @@ define amdgpu_kernel void @global_agent_monotonic_store(
;
; GFX7-LABEL: global_agent_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1088,7 +1106,7 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4
ret void
@@ -1116,6 +1134,9 @@ define amdgpu_kernel void @global_agent_release_store(
;
; GFX7-LABEL: global_agent_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1266,7 +1287,7 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4
ret void
@@ -1294,6 +1315,9 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
;
; GFX7-LABEL: global_agent_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1444,7 +1468,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4
ret void
@@ -1470,6 +1494,9 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
;
; GFX7-LABEL: global_agent_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1595,7 +1622,7 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") monotonic
ret void
@@ -1623,6 +1650,9 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
;
; GFX7-LABEL: global_agent_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1775,7 +1805,7 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
ret void
@@ -1802,6 +1832,9 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
;
; GFX7-LABEL: global_agent_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1951,7 +1984,7 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") release
ret void
@@ -1980,6 +2013,9 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_agent_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2156,7 +2192,7 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
ret void
@@ -2185,6 +2221,9 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_agent_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2361,7 +2400,7 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
ret void
@@ -2390,6 +2429,9 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2556,7 +2598,7 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2587,6 +2629,9 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2781,7 +2826,7 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2812,6 +2857,9 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -3006,7 +3054,7 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -3038,6 +3086,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3222,7 +3273,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic
@@ -3256,6 +3307,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3467,7 +3521,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
@@ -3500,6 +3554,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3708,7 +3765,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") release monotonic
@@ -3743,6 +3800,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3978,7 +4038,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
@@ -4013,6 +4073,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4248,7 +4311,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
@@ -4282,6 +4345,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4493,7 +4559,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire
@@ -4527,6 +4593,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4738,7 +4807,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
@@ -4773,6 +4842,9 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5008,7 +5080,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") release acquire
@@ -5043,6 +5115,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5278,7 +5353,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
@@ -5313,6 +5388,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5548,7 +5626,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
@@ -5583,6 +5661,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5818,7 +5899,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst
@@ -5853,6 +5934,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6088,7 +6172,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst
@@ -6123,6 +6207,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6358,7 +6445,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") release seq_cst
@@ -6393,6 +6480,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6628,7 +6718,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst
@@ -6663,6 +6753,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6898,7 +6991,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -6933,6 +7026,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7144,7 +7240,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic
@@ -7182,6 +7278,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7409,7 +7508,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
@@ -7447,6 +7546,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7682,7 +7784,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") release monotonic
@@ -7721,6 +7823,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7976,7 +8081,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
@@ -8015,6 +8120,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8270,7 +8378,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
@@ -8308,6 +8416,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8539,7 +8650,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire
@@ -8577,6 +8688,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8804,7 +8918,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
@@ -8843,6 +8957,9 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9098,7 +9215,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") release acquire
@@ -9137,6 +9254,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9392,7 +9512,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
@@ -9431,6 +9551,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9686,7 +9809,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
@@ -9725,6 +9848,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9980,7 +10106,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst
@@ -10019,6 +10145,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10270,7 +10399,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst
@@ -10309,6 +10438,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10564,7 +10696,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") release seq_cst
@@ -10603,6 +10735,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10858,7 +10993,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst
@@ -10897,6 +11032,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11152,7 +11290,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -11189,6 +11327,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
;
; GFX7-LABEL: global_agent_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11335,7 +11476,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -11370,6 +11511,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
;
; GFX7-LABEL: global_agent_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11516,7 +11660,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -11552,6 +11696,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
;
; GFX7-LABEL: global_agent_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11713,7 +11860,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -11750,6 +11897,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11929,7 +12079,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -11957,6 +12107,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
;
; GFX7-LABEL: global_agent_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12083,7 +12236,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") unordered, align 4
ret void
@@ -12110,6 +12263,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
;
; GFX7-LABEL: global_agent_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12236,7 +12392,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") monotonic, align 4
ret void
@@ -12264,6 +12420,9 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
;
; GFX7-LABEL: global_agent_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12414,7 +12573,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") release, align 4
ret void
@@ -12442,6 +12601,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12592,7 +12754,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") seq_cst, align 4
ret void
@@ -12618,6 +12780,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12743,7 +12908,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") monotonic
ret void
@@ -12771,6 +12936,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12923,7 +13091,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
ret void
@@ -12950,6 +13118,9 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13099,7 +13270,7 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") release
ret void
@@ -13128,6 +13299,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13304,7 +13478,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
ret void
@@ -13333,6 +13507,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13509,7 +13686,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
ret void
@@ -13538,6 +13715,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13704,7 +13884,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -13735,6 +13915,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13929,7 +14112,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -13960,6 +14143,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14154,7 +14340,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -14186,6 +14372,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14370,7 +14559,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
@@ -14404,6 +14593,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14615,7 +14807,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
@@ -14648,6 +14840,9 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14856,7 +15051,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic
@@ -14891,6 +15086,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15126,7 +15324,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
@@ -15161,6 +15359,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15396,7 +15597,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
@@ -15430,6 +15631,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15641,7 +15845,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire
@@ -15675,6 +15879,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15886,7 +16093,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
@@ -15921,6 +16128,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16156,7 +16366,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
@@ -16191,6 +16401,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16426,7 +16639,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
@@ -16461,6 +16674,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16696,7 +16912,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
@@ -16731,6 +16947,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16966,7 +17185,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst
@@ -17001,6 +17220,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17236,7 +17458,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst
@@ -17271,6 +17493,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17506,7 +17731,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst
@@ -17541,6 +17766,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17776,7 +18004,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst
@@ -17811,6 +18039,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18046,7 +18277,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
@@ -18081,6 +18312,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18292,7 +18526,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
@@ -18330,6 +18564,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18557,7 +18794,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
@@ -18596,6 +18833,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18851,7 +19091,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
@@ -18890,6 +19130,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19145,7 +19388,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
@@ -19183,6 +19426,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19414,7 +19660,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire
@@ -19452,6 +19698,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19679,7 +19928,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
@@ -19718,6 +19967,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19973,7 +20225,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
@@ -20012,6 +20264,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20267,7 +20522,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
@@ -20306,6 +20561,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20561,7 +20819,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
@@ -20600,6 +20858,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20855,7 +21116,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst
@@ -20894,6 +21155,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21145,7 +21409,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst
@@ -21184,6 +21448,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21439,7 +21706,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst
@@ -21478,6 +21745,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21733,7 +22003,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst
@@ -21772,6 +22042,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22027,7 +22300,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
@@ -22035,5 +22308,3 @@ entry:
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 9afd2b5183efb..16e55058e4fc8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -36,6 +36,9 @@ define amdgpu_kernel void @global_nontemporal_load_0(
;
; GFX7-LABEL: global_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -186,7 +189,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load i32, ptr addrspace(1) %in, align 4, !nontemporal !0
store i32 %val, ptr addrspace(1) %out
@@ -227,6 +230,9 @@ define amdgpu_kernel void @global_nontemporal_load_1(
;
; GFX7-LABEL: global_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -442,7 +448,7 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
@@ -474,6 +480,9 @@ define amdgpu_kernel void @global_nontemporal_store_0(
;
; GFX7-LABEL: global_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -624,7 +633,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
store i32 %val, ptr addrspace(1) %out, !nontemporal !0
@@ -658,6 +667,9 @@ define amdgpu_kernel void @global_nontemporal_store_1(
;
; GFX7-LABEL: global_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -854,7 +866,7 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr addrspace(1) %in, align 4
@@ -891,6 +903,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
;
; GFX7-LABEL: global_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1041,7 +1056,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0
store i32 %val, ptr addrspace(1) %out
@@ -1050,5 +1065,3 @@ entry:
!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index 73958d6e2c3d6..8042d38716107 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
;
; GFX7-LABEL: global_singlethread_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -187,7 +190,7 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
;
; GFX7-LABEL: global_singlethread_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -368,7 +374,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -403,6 +409,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
;
; GFX7-LABEL: global_singlethread_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -549,7 +558,7 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -584,6 +593,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
;
; GFX7-LABEL: global_singlethread_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -730,7 +742,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -758,6 +770,9 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
;
; GFX7-LABEL: global_singlethread_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -884,7 +899,7 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") unordered, align 4
ret void
@@ -911,6 +926,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
;
; GFX7-LABEL: global_singlethread_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1037,7 +1055,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") monotonic, align 4
ret void
@@ -1064,6 +1082,9 @@ define amdgpu_kernel void @global_singlethread_release_store(
;
; GFX7-LABEL: global_singlethread_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1190,7 +1211,7 @@ define amdgpu_kernel void @global_singlethread_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") release, align 4
ret void
@@ -1217,6 +1238,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
;
; GFX7-LABEL: global_singlethread_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1343,7 +1367,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") seq_cst, align 4
ret void
@@ -1369,6 +1393,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
;
; GFX7-LABEL: global_singlethread_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1494,7 +1521,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") monotonic
ret void
@@ -1520,6 +1547,9 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1645,7 +1675,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
ret void
@@ -1671,6 +1701,9 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
;
; GFX7-LABEL: global_singlethread_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1796,7 +1829,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") release
ret void
@@ -1822,6 +1855,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1947,7 +1983,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
ret void
@@ -1973,6 +2009,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2098,7 +2137,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
ret void
@@ -2126,6 +2165,9 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2277,7 +2319,7 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2306,6 +2348,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2457,7 +2502,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2486,6 +2531,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2637,7 +2685,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2669,6 +2717,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2853,7 +2904,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
@@ -2885,6 +2936,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3069,7 +3123,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
@@ -3101,6 +3155,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3285,7 +3342,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
@@ -3317,6 +3374,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3501,7 +3561,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
@@ -3533,6 +3593,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3717,7 +3780,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
@@ -3749,6 +3812,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3933,7 +3999,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
@@ -3965,6 +4031,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4149,7 +4218,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
@@ -4181,6 +4250,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4365,7 +4437,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
@@ -4397,6 +4469,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4581,7 +4656,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
@@ -4613,6 +4688,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4797,7 +4875,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
@@ -4829,6 +4907,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5013,7 +5094,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
@@ -5045,6 +5126,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5229,7 +5313,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
@@ -5261,6 +5345,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5445,7 +5532,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
@@ -5477,6 +5564,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5661,7 +5751,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
@@ -5693,6 +5783,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5877,7 +5970,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
@@ -5912,6 +6005,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6123,7 +6219,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
@@ -6160,6 +6256,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6371,7 +6470,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
@@ -6408,6 +6507,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6619,7 +6721,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
@@ -6656,6 +6758,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6867,7 +6972,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
@@ -6904,6 +7009,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7115,7 +7223,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
@@ -7152,6 +7260,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7363,7 +7474,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
@@ -7400,6 +7511,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7611,7 +7725,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
@@ -7648,6 +7762,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7859,7 +7976,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
@@ -7896,6 +8013,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8107,7 +8227,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
@@ -8144,6 +8264,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8355,7 +8478,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
@@ -8392,6 +8515,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8603,7 +8729,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
@@ -8640,6 +8766,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8851,7 +8980,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
@@ -8888,6 +9017,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9099,7 +9231,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
@@ -9136,6 +9268,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9347,7 +9482,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
@@ -9384,6 +9519,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9595,7 +9733,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
@@ -9632,6 +9770,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
;
; GFX7-LABEL: global_singlethread_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9778,7 +9919,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -9813,6 +9954,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9959,7 +10103,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -9994,6 +10138,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10140,7 +10287,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10175,6 +10322,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10321,7 +10471,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10349,6 +10499,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
;
; GFX7-LABEL: global_singlethread_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10475,7 +10628,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") unordered, align 4
ret void
@@ -10502,6 +10655,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10628,7 +10784,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") monotonic, align 4
ret void
@@ -10655,6 +10811,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
;
; GFX7-LABEL: global_singlethread_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10781,7 +10940,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") release, align 4
ret void
@@ -10808,6 +10967,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10934,7 +11096,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") seq_cst, align 4
ret void
@@ -10960,6 +11122,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11085,7 +11250,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") monotonic
ret void
@@ -11111,6 +11276,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11236,7 +11404,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
ret void
@@ -11262,6 +11430,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11387,7 +11558,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") release
ret void
@@ -11413,6 +11584,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11538,7 +11712,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
ret void
@@ -11564,6 +11738,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11689,7 +11866,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
ret void
@@ -11717,6 +11894,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11868,7 +12048,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -11897,6 +12077,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12048,7 +12231,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12077,6 +12260,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12228,7 +12414,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12260,6 +12446,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12444,7 +12633,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
@@ -12476,6 +12665,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12660,7 +12852,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
@@ -12692,6 +12884,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12876,7 +13071,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
@@ -12908,6 +13103,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13092,7 +13290,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
@@ -13124,6 +13322,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13308,7 +13509,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
@@ -13340,6 +13541,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13524,7 +13728,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
@@ -13556,6 +13760,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13740,7 +13947,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
@@ -13772,6 +13979,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13956,7 +14166,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
@@ -13988,6 +14198,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14172,7 +14385,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
@@ -14204,6 +14417,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14388,7 +14604,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
@@ -14420,6 +14636,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14604,7 +14823,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
@@ -14636,6 +14855,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14820,7 +15042,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
@@ -14852,6 +15074,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15036,7 +15261,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
@@ -15068,6 +15293,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15252,7 +15480,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
@@ -15284,6 +15512,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15468,7 +15699,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
@@ -15503,6 +15734,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15714,7 +15948,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
@@ -15751,6 +15985,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15962,7 +16199,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
@@ -15999,6 +16236,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16210,7 +16450,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
@@ -16247,6 +16487,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16458,7 +16701,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
@@ -16495,6 +16738,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16706,7 +16952,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
@@ -16743,6 +16989,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16954,7 +17203,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
@@ -16991,6 +17240,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17202,7 +17454,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
@@ -17239,6 +17491,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17450,7 +17705,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
@@ -17487,6 +17742,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17698,7 +17956,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
@@ -17735,6 +17993,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17946,7 +18207,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
@@ -17983,6 +18244,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18194,7 +18458,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
@@ -18231,6 +18495,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18442,7 +18709,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
@@ -18479,6 +18746,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18690,7 +18960,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
@@ -18727,6 +18997,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18938,7 +19211,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
@@ -18975,6 +19248,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19186,7 +19462,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
@@ -19194,5 +19470,3 @@ entry:
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 7d98eeaad7998..9c11781da56f2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_system_unordered_load(
;
; GFX7-LABEL: global_system_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -187,7 +190,7 @@ define amdgpu_kernel void @global_system_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_system_monotonic_load(
;
; GFX7-LABEL: global_system_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -368,7 +374,7 @@ define amdgpu_kernel void @global_system_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -404,6 +410,9 @@ define amdgpu_kernel void @global_system_acquire_load(
;
; GFX7-LABEL: global_system_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -567,7 +576,7 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -604,6 +613,9 @@ define amdgpu_kernel void @global_system_seq_cst_load(
;
; GFX7-LABEL: global_system_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -785,7 +797,7 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -813,6 +825,9 @@ define amdgpu_kernel void @global_system_unordered_store(
;
; GFX7-LABEL: global_system_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -939,7 +954,7 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out unordered, align 4
ret void
@@ -966,6 +981,9 @@ define amdgpu_kernel void @global_system_monotonic_store(
;
; GFX7-LABEL: global_system_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1092,7 +1110,7 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4
ret void
@@ -1120,6 +1138,9 @@ define amdgpu_kernel void @global_system_release_store(
;
; GFX7-LABEL: global_system_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1274,7 +1295,7 @@ define amdgpu_kernel void @global_system_release_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out release, align 4
ret void
@@ -1302,6 +1323,9 @@ define amdgpu_kernel void @global_system_seq_cst_store(
;
; GFX7-LABEL: global_system_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1456,7 +1480,7 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4
ret void
@@ -1482,6 +1506,9 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
;
; GFX7-LABEL: global_system_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1607,7 +1634,7 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic
ret void
@@ -1635,6 +1662,9 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
;
; GFX7-LABEL: global_system_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1789,7 +1819,7 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
ret void
@@ -1816,6 +1846,9 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
;
; GFX7-LABEL: global_system_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1969,7 +2002,7 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release
ret void
@@ -1998,6 +2031,9 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_system_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2180,7 +2216,7 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
ret void
@@ -2209,6 +2245,9 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_system_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2391,7 +2430,7 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
ret void
@@ -2420,6 +2459,9 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_system_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2588,7 +2630,7 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2619,6 +2661,9 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2819,7 +2864,7 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2850,6 +2895,9 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -3050,7 +3098,7 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -3082,6 +3130,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3266,7 +3317,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic monotonic
@@ -3300,6 +3351,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3513,7 +3567,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire monotonic
@@ -3546,6 +3600,9 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3758,7 +3815,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in release monotonic
@@ -3793,6 +3850,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4034,7 +4094,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel monotonic
@@ -4069,6 +4129,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4310,7 +4373,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst monotonic
@@ -4344,6 +4407,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4557,7 +4623,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic acquire
@@ -4591,6 +4657,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4804,7 +4873,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire acquire
@@ -4839,6 +4908,9 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5080,7 +5152,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in release acquire
@@ -5115,6 +5187,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5356,7 +5431,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel acquire
@@ -5391,6 +5466,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5632,7 +5710,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst acquire
@@ -5667,6 +5745,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5908,7 +5989,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -5943,6 +6024,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6154,7 +6238,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic monotonic
@@ -6192,6 +6276,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6421,7 +6508,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire monotonic
@@ -6460,6 +6547,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6721,7 +6811,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel monotonic
@@ -6760,6 +6850,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7021,7 +7114,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst monotonic
@@ -7059,6 +7152,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7292,7 +7388,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic acquire
@@ -7330,6 +7426,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7559,7 +7658,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire acquire
@@ -7598,6 +7697,9 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7859,7 +7961,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in release acquire
@@ -7898,6 +8000,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8159,7 +8264,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel acquire
@@ -8198,6 +8303,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8459,7 +8567,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst acquire
@@ -8498,6 +8606,9 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8759,7 +8870,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic seq_cst
@@ -8798,6 +8909,9 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9055,7 +9169,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire seq_cst
@@ -9094,6 +9208,9 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9355,7 +9472,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in release seq_cst
@@ -9394,6 +9511,9 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9655,7 +9775,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel seq_cst
@@ -9694,6 +9814,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9955,7 +10078,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -9992,6 +10115,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
;
; GFX7-LABEL: global_system_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10138,7 +10264,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10173,6 +10299,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
;
; GFX7-LABEL: global_system_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10319,7 +10448,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10355,6 +10484,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
;
; GFX7-LABEL: global_system_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10518,7 +10650,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10555,6 +10687,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
;
; GFX7-LABEL: global_system_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10736,7 +10871,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10764,6 +10899,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
;
; GFX7-LABEL: global_system_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10890,7 +11028,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4
ret void
@@ -10917,6 +11055,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
;
; GFX7-LABEL: global_system_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11043,7 +11184,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4
ret void
@@ -11071,6 +11212,9 @@ define amdgpu_kernel void @global_system_one_as_release_store(
;
; GFX7-LABEL: global_system_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11225,7 +11369,7 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4
ret void
@@ -11253,6 +11397,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
;
; GFX7-LABEL: global_system_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11407,7 +11554,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4
ret void
@@ -11433,6 +11580,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11558,7 +11708,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic
ret void
@@ -11586,6 +11736,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11740,7 +11893,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
ret void
@@ -11767,6 +11920,9 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11920,7 +12076,7 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release
ret void
@@ -11949,6 +12105,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12131,7 +12290,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
ret void
@@ -12160,6 +12319,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12342,7 +12504,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
ret void
@@ -12371,6 +12533,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12539,7 +12704,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12570,6 +12735,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12770,7 +12938,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12801,6 +12969,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13001,7 +13172,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -13033,6 +13204,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13217,7 +13391,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
@@ -13251,6 +13425,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13464,7 +13641,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
@@ -13497,6 +13674,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13709,7 +13889,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
@@ -13744,6 +13924,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13985,7 +14168,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
@@ -14020,6 +14203,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14261,7 +14447,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
@@ -14295,6 +14481,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14508,7 +14697,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
@@ -14542,6 +14731,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14755,7 +14947,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
@@ -14790,6 +14982,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15031,7 +15226,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release acquire
@@ -15066,6 +15261,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15307,7 +15505,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
@@ -15342,6 +15540,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15583,7 +15784,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
@@ -15618,6 +15819,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15859,7 +16063,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
@@ -15894,6 +16098,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16135,7 +16342,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
@@ -16170,6 +16377,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16411,7 +16621,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
@@ -16446,6 +16656,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16687,7 +16900,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
@@ -16722,6 +16935,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16963,7 +17179,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
@@ -16998,6 +17214,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17209,7 +17428,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
@@ -17247,6 +17466,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17476,7 +17698,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
@@ -17514,6 +17736,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17753,7 +17978,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
@@ -17792,6 +18017,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18053,7 +18281,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
@@ -18092,6 +18320,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18353,7 +18584,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
@@ -18391,6 +18622,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18624,7 +18858,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
@@ -18662,6 +18896,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18891,7 +19128,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
@@ -18930,6 +19167,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19191,7 +19431,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release acquire
@@ -19230,6 +19470,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19491,7 +19734,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
@@ -19530,6 +19773,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19791,7 +20037,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
@@ -19830,6 +20076,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20091,7 +20340,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
@@ -20130,6 +20379,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20387,7 +20639,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
@@ -20426,6 +20678,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20687,7 +20942,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
@@ -20726,6 +20981,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20987,7 +21245,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
@@ -21026,6 +21284,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21287,7 +21548,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
@@ -21295,5 +21556,3 @@ entry:
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 66a8a9a0ac569..8a5c5dda9f79c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -37,6 +37,9 @@ define amdgpu_kernel void @global_volatile_load_0(
;
; GFX7-LABEL: global_volatile_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -143,7 +146,7 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4
store i32 %val, ptr addrspace(1) %out
@@ -184,6 +187,9 @@ define amdgpu_kernel void @global_volatile_load_1(
;
; GFX7-LABEL: global_volatile_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -339,7 +345,7 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
@@ -372,6 +378,9 @@ define amdgpu_kernel void @global_volatile_store_0(
;
; GFX7-LABEL: global_volatile_store_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -492,7 +501,7 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
store volatile i32 %val, ptr addrspace(1) %out
@@ -527,6 +536,9 @@ define amdgpu_kernel void @global_volatile_store_1(
;
; GFX7-LABEL: global_volatile_store_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -681,7 +693,7 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr addrspace(1) %in, align 4
@@ -718,6 +730,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
;
; GFX7-LABEL: global_volatile_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -823,7 +838,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic volatile i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -852,6 +867,9 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
;
; GFX7-LABEL: global_volatile_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -951,11 +969,10 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic volatile i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index ecd584fd00e3b..151ba07a0b531 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
;
; GFX7-LABEL: global_wavefront_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -187,7 +190,7 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
;
; GFX7-LABEL: global_wavefront_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -368,7 +374,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -403,6 +409,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
;
; GFX7-LABEL: global_wavefront_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -549,7 +558,7 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -584,6 +593,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
;
; GFX7-LABEL: global_wavefront_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -730,7 +742,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -758,6 +770,9 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
;
; GFX7-LABEL: global_wavefront_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -884,7 +899,7 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4
ret void
@@ -911,6 +926,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
;
; GFX7-LABEL: global_wavefront_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1037,7 +1055,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4
ret void
@@ -1064,6 +1082,9 @@ define amdgpu_kernel void @global_wavefront_release_store(
;
; GFX7-LABEL: global_wavefront_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1190,7 +1211,7 @@ define amdgpu_kernel void @global_wavefront_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4
ret void
@@ -1217,6 +1238,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
;
; GFX7-LABEL: global_wavefront_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1343,7 +1367,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4
ret void
@@ -1369,6 +1393,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
;
; GFX7-LABEL: global_wavefront_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1494,7 +1521,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic
ret void
@@ -1520,6 +1547,9 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1645,7 +1675,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
ret void
@@ -1671,6 +1701,9 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
;
; GFX7-LABEL: global_wavefront_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1796,7 +1829,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release
ret void
@@ -1822,6 +1855,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1947,7 +1983,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
ret void
@@ -1973,6 +2009,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2098,7 +2137,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
ret void
@@ -2126,6 +2165,9 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2277,7 +2319,7 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2306,6 +2348,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2457,7 +2502,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2486,6 +2531,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2637,7 +2685,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2669,6 +2717,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2853,7 +2904,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
@@ -2885,6 +2936,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3069,7 +3123,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
@@ -3101,6 +3155,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3285,7 +3342,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
@@ -3317,6 +3374,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3501,7 +3561,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
@@ -3533,6 +3593,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3717,7 +3780,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
@@ -3749,6 +3812,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3933,7 +3999,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
@@ -3965,6 +4031,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4149,7 +4218,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
@@ -4181,6 +4250,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4365,7 +4437,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
@@ -4397,6 +4469,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4581,7 +4656,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
@@ -4613,6 +4688,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4797,7 +4875,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
@@ -4829,6 +4907,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5013,7 +5094,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
@@ -5045,6 +5126,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5229,7 +5313,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
@@ -5261,6 +5345,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5445,7 +5532,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
@@ -5477,6 +5564,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5661,7 +5751,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
@@ -5693,6 +5783,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5877,7 +5970,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
@@ -5912,6 +6005,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6123,7 +6219,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
@@ -6160,6 +6256,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6371,7 +6470,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
@@ -6408,6 +6507,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6619,7 +6721,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
@@ -6656,6 +6758,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6867,7 +6972,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
@@ -6904,6 +7009,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7115,7 +7223,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
@@ -7152,6 +7260,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7363,7 +7474,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
@@ -7400,6 +7511,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7611,7 +7725,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
@@ -7648,6 +7762,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7859,7 +7976,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
@@ -7896,6 +8013,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8107,7 +8227,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
@@ -8144,6 +8264,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8355,7 +8478,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
@@ -8392,6 +8515,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8603,7 +8729,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
@@ -8640,6 +8766,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8851,7 +8980,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
@@ -8888,6 +9017,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9099,7 +9231,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
@@ -9136,6 +9268,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9347,7 +9482,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
@@ -9384,6 +9519,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9595,7 +9733,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
@@ -9632,6 +9770,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
;
; GFX7-LABEL: global_wavefront_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9778,7 +9919,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -9813,6 +9954,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9959,7 +10103,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -9994,6 +10138,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10140,7 +10287,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10175,6 +10322,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10321,7 +10471,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10349,6 +10499,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
;
; GFX7-LABEL: global_wavefront_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10475,7 +10628,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4
ret void
@@ -10502,6 +10655,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10628,7 +10784,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4
ret void
@@ -10655,6 +10811,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
;
; GFX7-LABEL: global_wavefront_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10781,7 +10940,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4
ret void
@@ -10808,6 +10967,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10934,7 +11096,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") seq_cst, align 4
ret void
@@ -10960,6 +11122,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11085,7 +11250,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic
ret void
@@ -11111,6 +11276,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11236,7 +11404,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
ret void
@@ -11262,6 +11430,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11387,7 +11558,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release
ret void
@@ -11413,6 +11584,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11538,7 +11712,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
ret void
@@ -11564,6 +11738,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11689,7 +11866,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
ret void
@@ -11717,6 +11894,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11868,7 +12048,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -11897,6 +12077,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12048,7 +12231,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12077,6 +12260,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12228,7 +12414,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12260,6 +12446,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12444,7 +12633,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
@@ -12476,6 +12665,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12660,7 +12852,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
@@ -12692,6 +12884,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12876,7 +13071,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
@@ -12908,6 +13103,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13092,7 +13290,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
@@ -13124,6 +13322,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13308,7 +13509,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
@@ -13340,6 +13541,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13524,7 +13728,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
@@ -13556,6 +13760,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13740,7 +13947,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
@@ -13772,6 +13979,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13956,7 +14166,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
@@ -13988,6 +14198,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14172,7 +14385,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
@@ -14204,6 +14417,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14388,7 +14604,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
@@ -14420,6 +14636,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14604,7 +14823,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
@@ -14636,6 +14855,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14820,7 +15042,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
@@ -14852,6 +15074,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15036,7 +15261,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
@@ -15068,6 +15293,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15252,7 +15480,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
@@ -15284,6 +15512,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15468,7 +15699,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
@@ -15503,6 +15734,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15714,7 +15948,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
@@ -15751,6 +15985,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15962,7 +16199,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
@@ -15999,6 +16236,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16210,7 +16450,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
@@ -16247,6 +16487,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16458,7 +16701,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
@@ -16495,6 +16738,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16706,7 +16952,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
@@ -16743,6 +16989,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16954,7 +17203,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
@@ -16991,6 +17240,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17202,7 +17454,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
@@ -17239,6 +17491,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17450,7 +17705,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
@@ -17487,6 +17742,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17698,7 +17956,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
@@ -17735,6 +17993,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17946,7 +18207,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
@@ -17983,6 +18244,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18194,7 +18458,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
@@ -18231,6 +18495,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18442,7 +18709,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
@@ -18479,6 +18746,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18690,7 +18960,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
@@ -18727,6 +18997,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18938,7 +19211,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
@@ -18975,6 +19248,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19186,7 +19462,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
@@ -19194,5 +19470,3 @@ entry:
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index e32256a85a809..69b0c7f93ab0e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
;
; GFX7-LABEL: global_workgroup_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -187,7 +190,7 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
;
; GFX7-LABEL: global_workgroup_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -368,7 +374,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -403,6 +409,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
;
; GFX7-LABEL: global_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -554,7 +563,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -590,6 +599,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
;
; GFX7-LABEL: global_workgroup_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -752,7 +764,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -780,6 +792,9 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
;
; GFX7-LABEL: global_workgroup_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -906,7 +921,7 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") unordered, align 4
ret void
@@ -933,6 +948,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
;
; GFX7-LABEL: global_workgroup_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1059,7 +1077,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") monotonic, align 4
ret void
@@ -1087,6 +1105,9 @@ define amdgpu_kernel void @global_workgroup_release_store(
;
; GFX7-LABEL: global_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1230,7 +1251,7 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
ret void
@@ -1258,6 +1279,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
;
; GFX7-LABEL: global_workgroup_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1401,7 +1425,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") seq_cst, align 4
ret void
@@ -1427,6 +1451,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
;
; GFX7-LABEL: global_workgroup_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1552,7 +1579,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") monotonic
ret void
@@ -1578,6 +1605,9 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1713,7 +1743,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
ret void
@@ -1740,6 +1770,9 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
;
; GFX7-LABEL: global_workgroup_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1882,7 +1915,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") release
ret void
@@ -1909,6 +1942,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2061,7 +2097,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
ret void
@@ -2088,6 +2124,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2240,7 +2279,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
ret void
@@ -2268,6 +2307,9 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2424,7 +2466,7 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2454,6 +2496,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2629,7 +2674,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2659,6 +2704,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2834,7 +2882,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -2866,6 +2914,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3050,7 +3101,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
@@ -3082,6 +3133,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3276,7 +3330,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
@@ -3309,6 +3363,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3510,7 +3567,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
@@ -3543,6 +3600,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3754,7 +3814,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
@@ -3787,6 +3847,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3998,7 +4061,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
@@ -4030,6 +4093,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4224,7 +4290,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
@@ -4256,6 +4322,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4450,7 +4519,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
@@ -4483,6 +4552,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4694,7 +4766,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
@@ -4727,6 +4799,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4938,7 +5013,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
@@ -4971,6 +5046,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5182,7 +5260,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
@@ -5215,6 +5293,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5426,7 +5507,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst
@@ -5459,6 +5540,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5670,7 +5754,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst
@@ -5703,6 +5787,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5914,7 +6001,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst
@@ -5947,6 +6034,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6158,7 +6248,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst
@@ -6191,6 +6281,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6402,7 +6495,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
@@ -6437,6 +6530,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6648,7 +6744,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
@@ -6685,6 +6781,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6901,7 +7000,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
@@ -6939,6 +7038,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7167,7 +7269,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
@@ -7205,6 +7307,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7440,7 +7545,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
@@ -7478,6 +7583,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7713,7 +7821,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
@@ -7750,6 +7858,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7968,7 +8079,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
@@ -8005,6 +8116,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8221,7 +8335,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
@@ -8259,6 +8373,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8494,7 +8611,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
@@ -8532,6 +8649,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8767,7 +8887,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
@@ -8805,6 +8925,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9040,7 +9163,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
@@ -9078,6 +9201,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9313,7 +9439,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst
@@ -9351,6 +9477,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9584,7 +9713,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst
@@ -9622,6 +9751,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9857,7 +9989,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst
@@ -9895,6 +10027,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10130,7 +10265,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst
@@ -10168,6 +10303,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10403,7 +10541,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
@@ -10440,6 +10578,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
;
; GFX7-LABEL: global_workgroup_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10586,7 +10727,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") unordered, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10621,6 +10762,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10767,7 +10911,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10802,6 +10946,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10953,7 +11100,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") acquire, align 4
store i32 %val, ptr addrspace(1) %out
@@ -10988,6 +11135,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11147,7 +11297,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
@@ -11175,6 +11325,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
;
; GFX7-LABEL: global_workgroup_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11301,7 +11454,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") unordered, align 4
ret void
@@ -11328,6 +11481,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11454,7 +11610,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") monotonic, align 4
ret void
@@ -11481,6 +11637,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
;
; GFX7-LABEL: global_workgroup_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11617,7 +11776,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") release, align 4
ret void
@@ -11644,6 +11803,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11780,7 +11942,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(1) %out) #0 {
+ i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") seq_cst, align 4
ret void
@@ -11806,6 +11968,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11931,7 +12096,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") monotonic
ret void
@@ -11957,6 +12122,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12092,7 +12260,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
ret void
@@ -12118,6 +12286,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12253,7 +12424,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") release
ret void
@@ -12279,6 +12450,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12424,7 +12598,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
ret void
@@ -12450,6 +12624,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12595,7 +12772,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
ret void
@@ -12623,6 +12800,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12779,7 +12959,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
store i32 %val, ptr addrspace(1) %out, align 4
@@ -12808,6 +12988,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12976,7 +13159,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
store i32 %val, ptr addrspace(1) %out, align 4
@@ -13005,6 +13188,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13173,7 +13359,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in) #0 {
+ ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
store i32 %val, ptr addrspace(1) %out, align 4
@@ -13205,6 +13391,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13389,7 +13578,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
@@ -13421,6 +13610,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13615,7 +13807,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
@@ -13647,6 +13839,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13841,7 +14036,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
@@ -13873,6 +14068,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14077,7 +14275,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
@@ -14109,6 +14307,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14313,7 +14514,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
@@ -14345,6 +14546,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14539,7 +14743,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
@@ -14571,6 +14775,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14765,7 +14972,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
@@ -14797,6 +15004,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15001,7 +15211,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
@@ -15033,6 +15243,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15237,7 +15450,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
@@ -15269,6 +15482,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15473,7 +15689,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
@@ -15505,6 +15721,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15709,7 +15928,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
@@ -15741,6 +15960,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15945,7 +16167,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
@@ -15977,6 +16199,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16181,7 +16406,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
@@ -16213,6 +16438,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16417,7 +16645,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
@@ -16449,6 +16677,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16653,7 +16884,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
@@ -16688,6 +16919,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16899,7 +17133,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
@@ -16936,6 +17170,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17152,7 +17389,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
@@ -17189,6 +17426,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17410,7 +17650,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
@@ -17447,6 +17687,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17675,7 +17918,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
@@ -17712,6 +17955,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17940,7 +18186,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
@@ -17977,6 +18223,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18195,7 +18444,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
@@ -18232,6 +18481,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18448,7 +18700,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
@@ -18485,6 +18737,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18713,7 +18968,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
@@ -18750,6 +19005,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18978,7 +19236,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
@@ -19015,6 +19273,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19243,7 +19504,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
@@ -19280,6 +19541,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19508,7 +19772,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
@@ -19545,6 +19809,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19771,7 +20038,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
@@ -19808,6 +20075,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20036,7 +20306,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
@@ -20073,6 +20343,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20301,7 +20574,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
@@ -20338,6 +20611,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20566,7 +20842,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %out, i32 %in, i32 %old) #0 {
+ ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
@@ -20574,5 +20850,3 @@ entry:
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index 7850b4dfd0ca0..78209ee34cad4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -38,6 +38,9 @@ define amdgpu_kernel void @local_nontemporal_load_0(
;
; GFX7-LABEL: local_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
@@ -190,7 +193,7 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(3) %in, ptr addrspace(1) %out) {
entry:
%val = load i32, ptr addrspace(3) %in, align 4, !nontemporal !0
store i32 %val, ptr addrspace(1) %out
@@ -224,6 +227,9 @@ define amdgpu_kernel void @local_nontemporal_load_1(
;
; GFX7-LABEL: local_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 s7, 2
@@ -422,7 +428,7 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(3) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(3) %in, i32 %tid
@@ -591,7 +597,7 @@ define amdgpu_kernel void @local_nontemporal_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(3) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(3) %out) {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
store i32 %val, ptr addrspace(3) %out, !nontemporal !0
@@ -796,7 +802,7 @@ define amdgpu_kernel void @local_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(3) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(3) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr addrspace(1) %in, align 4
@@ -830,6 +836,9 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
;
; GFX7-LABEL: local_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
@@ -982,7 +991,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(3) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(3) %in, align 4, !nontemporal !0
store i32 %val, ptr addrspace(1) %out
@@ -991,4 +1000,3 @@ entry:
!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 39293f6b267a8..bc2508411ed6b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -34,6 +34,9 @@ define amdgpu_kernel void @local_volatile_load_0(
;
; GFX7-LABEL: local_volatile_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
@@ -138,7 +141,7 @@ define amdgpu_kernel void @local_volatile_load_0(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(3) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(3) %in, align 4
store i32 %val, ptr addrspace(1) %out
@@ -172,6 +175,9 @@ define amdgpu_kernel void @local_volatile_load_1(
;
; GFX7-LABEL: local_volatile_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 s7, 2
@@ -302,7 +308,7 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(3) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(3) %in, i32 %tid
@@ -433,7 +439,7 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(3) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(3) %out) {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
store volatile i32 %val, ptr addrspace(3) %out
@@ -584,7 +590,7 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(3) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(3) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr addrspace(1) %in, align 4
@@ -712,7 +718,7 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(3) %in, ptr addrspace(3) %out) #0 {
+ ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic volatile i32, ptr addrspace(3) %in syncscope("workgroup") acquire, align 4
store i32 %val, ptr addrspace(3) %out
@@ -827,11 +833,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
- i32 %in, ptr addrspace(3) %out) #0 {
+ i32 %in, ptr addrspace(3) %out) {
entry:
store atomic volatile i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 29dfce7b682de..2aa4f021c259c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -38,7 +38,10 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX7-LABEL: private_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -53,7 +56,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX10-WGP-LABEL: private_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -67,7 +70,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX10-CU-LABEL: private_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -107,7 +110,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -121,7 +124,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -198,7 +201,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(5) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(5) %in, ptr addrspace(1) %out) {
entry:
%val = load i32, ptr addrspace(5) %in, align 4, !nontemporal !0
store i32 %val, ptr addrspace(1) %out
@@ -232,7 +235,10 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX7-LABEL: private_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -249,7 +255,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX10-WGP-LABEL: private_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -265,7 +271,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX10-CU-LABEL: private_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -309,7 +315,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -328,7 +334,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -444,7 +450,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(5) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(5) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(5) %in, i32 %tid
@@ -470,7 +476,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX7-LABEL: private_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
@@ -484,7 +490,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX10-WGP-LABEL: private_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -498,7 +504,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX10-CU-LABEL: private_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -530,7 +536,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -544,7 +550,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -621,7 +627,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(5) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(5) %out) {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
store i32 %val, ptr addrspace(5) %out, !nontemporal !0
@@ -647,7 +653,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX7-LABEL: private_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
@@ -663,7 +669,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX10-WGP-LABEL: private_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -678,7 +684,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX10-CU-LABEL: private_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -713,7 +719,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -731,7 +737,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -840,7 +846,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: scratch_store_b32 v1, v0, s0 th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(5) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(5) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr addrspace(1) %in, align 4
@@ -874,7 +880,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX7-LABEL: private_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -889,7 +898,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX10-WGP-LABEL: private_nontemporal_volatile_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -903,7 +912,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX10-CU-LABEL: private_nontemporal_volatile_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -943,7 +952,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -957,7 +966,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -1038,7 +1047,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(5) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(5) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(5) %in, align 4, !nontemporal !0
store i32 %val, ptr addrspace(1) %out
@@ -1047,4 +1056,3 @@ entry:
!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index 77a93f2156543..df4193969f8a0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -38,7 +38,10 @@ define amdgpu_kernel void @private_volatile_load_0(
;
; GFX7-LABEL: private_volatile_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -53,7 +56,7 @@ define amdgpu_kernel void @private_volatile_load_0(
;
; GFX10-WGP-LABEL: private_volatile_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -67,7 +70,7 @@ define amdgpu_kernel void @private_volatile_load_0(
;
; GFX10-CU-LABEL: private_volatile_load_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -152,7 +155,7 @@ define amdgpu_kernel void @private_volatile_load_0(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(5) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(5) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(5) %in, align 4
store i32 %val, ptr addrspace(1) %out
@@ -190,7 +193,10 @@ define amdgpu_kernel void @private_volatile_load_1(
;
; GFX7-LABEL: private_volatile_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -207,7 +213,7 @@ define amdgpu_kernel void @private_volatile_load_1(
;
; GFX10-WGP-LABEL: private_volatile_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -223,7 +229,7 @@ define amdgpu_kernel void @private_volatile_load_1(
;
; GFX10-CU-LABEL: private_volatile_load_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -334,7 +340,7 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(5) %in, ptr addrspace(1) %out) #0 {
+ ptr addrspace(5) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(5) %in, i32 %tid
@@ -365,7 +371,7 @@ define amdgpu_kernel void @private_volatile_store_0(
;
; GFX7-LABEL: private_volatile_store_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
@@ -380,7 +386,7 @@ define amdgpu_kernel void @private_volatile_store_0(
;
; GFX10-WGP-LABEL: private_volatile_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -395,7 +401,7 @@ define amdgpu_kernel void @private_volatile_store_0(
;
; GFX10-CU-LABEL: private_volatile_store_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -484,7 +490,7 @@ define amdgpu_kernel void @private_volatile_store_0(
; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(5) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(5) %out) {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
store volatile i32 %val, ptr addrspace(5) %out
@@ -515,7 +521,7 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX7-LABEL: private_volatile_store_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
@@ -532,7 +538,7 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX10-WGP-LABEL: private_volatile_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -548,7 +554,7 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX10-CU-LABEL: private_volatile_store_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -658,7 +664,7 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX12-CU-NEXT: scratch_store_b32 v1, v0, s0 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(5) %out) #0 {
+ ptr addrspace(1) %in, ptr addrspace(5) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, ptr addrspace(1) %in, align 4
@@ -668,4 +674,3 @@ entry:
}
declare i32 @llvm.amdgcn.workitem.id.x()
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index aa562d7328824..07072f6a36296 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -34,10 +34,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -56,10 +59,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -144,6 +150,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_imin_sle_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -155,6 +164,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_imin_sle_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -214,6 +226,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
; CI-LABEL: s_test_imin_sle_v1i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -225,6 +240,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
; VI-LABEL: s_test_imin_sle_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -288,6 +306,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s3, s3, s7
; CI-NEXT: s_min_i32 s2, s2, s6
@@ -306,6 +327,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s3, s3, s7
; VI-NEXT: s_min_i32 s2, s2, s6
@@ -414,11 +438,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i8 s2, s2
; CI-NEXT: s_sext_i32_i8 s3, s3
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_byte v[0:1], v2
@@ -429,11 +456,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i8 s2, s2
; VI-NEXT: s_sext_i32_i8 s3, s3
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -549,6 +579,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s4, s2, 24
; CI-NEXT: s_sext_i32_i8 s5, s2
@@ -572,6 +604,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; CI-NEXT: s_and_b32 s3, s3, 0xffff
; CI-NEXT: s_or_b32 s2, s3, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -582,6 +615,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s4, s2, 24
; VI-NEXT: s_bfe_i32 s5, s2, 0x80010
@@ -605,6 +640,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_or_b32 s2, s2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -757,6 +793,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
; CI-LABEL: s_test_imin_sle_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s4, s2, 16
; CI-NEXT: s_sext_i32_i16 s2, s2
@@ -776,6 +815,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
; VI-LABEL: s_test_imin_sle_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s4, s2, 16
; VI-NEXT: s_sext_i32_i16 s2, s2
@@ -857,6 +899,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s6, s0, 16
; CI-NEXT: s_ashr_i32 s7, s1, 16
@@ -887,6 +932,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s6, s1, 16
; VI-NEXT: s_sext_i32_i16 s1, s1
@@ -983,10 +1031,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1005,10 +1056,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1122,10 +1176,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1144,10 +1201,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1233,6 +1293,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_imin_slt_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1244,6 +1307,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_imin_slt_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1305,6 +1371,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s1, s1, s3
; CI-NEXT: s_min_i32 s0, s0, s2
@@ -1319,6 +1388,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s1, s1, s3
; VI-NEXT: s_min_i32 s0, s0, s2
@@ -1391,6 +1463,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, 8
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1403,6 +1478,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1468,6 +1546,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, 8
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1480,6 +1561,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1557,10 +1641,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1579,10 +1666,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1686,12 +1776,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v6
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v2, s5
; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v6
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; CI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
@@ -1710,12 +1803,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v6
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; VI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
@@ -1838,12 +1934,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -1874,12 +1973,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -1976,6 +2078,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_umin_ule_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1987,6 +2092,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_umin_ule_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2059,10 +2167,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2081,10 +2192,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2188,6 +2302,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s3
; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
@@ -2209,6 +2326,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0
@@ -2294,6 +2414,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_umin_ult_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2305,6 +2428,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_umin_ult_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2386,6 +2512,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
; CI-LABEL: v_test_umin_ult_i32_multi_use:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-NEXT: s_load_dword s5, s[6:7], 0x0
@@ -2407,6 +2536,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
; VI-LABEL: v_test_umin_ult_i32_multi_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s4, s[4:5], 0x0
; VI-NEXT: s_load_dword s5, s[6:7], 0x0
@@ -2534,6 +2666,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; CI-LABEL: v_test_umin_ult_i16_multi_use:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
@@ -2556,6 +2691,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; VI-LABEL: v_test_umin_ult_i16_multi_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
@@ -2646,6 +2784,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
; CI-LABEL: s_test_umin_ult_v1i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2657,6 +2798,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
; VI-LABEL: s_test_umin_ult_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2726,6 +2870,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
;
; CI-LABEL: s_test_umin_ult_v8i32:
; CI: ; %bb.0:
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -2757,6 +2904,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
;
; VI-LABEL: s_test_umin_ult_v8i32:
; VI: ; %bb.0:
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -2921,6 +3071,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s10, s0, 16
; CI-NEXT: s_and_b32 s0, s0, 0xffff
@@ -2967,6 +3120,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s10, s3, 16
; VI-NEXT: s_and_b32 s3, s3, 0xffff
@@ -3088,11 +3244,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0xffff
; CI-NEXT: s_and_b32 s3, s3, 0xffff
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -3103,11 +3262,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -3195,11 +3357,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i16 s2, s2
; CI-NEXT: s_sext_i32_i16 s3, s3
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -3210,11 +3375,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: s_sext_i32_i16 s3, s3
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -3309,6 +3477,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i16 s3, s2
; CI-NEXT: s_ashr_i32 s2, s2, 16
@@ -3323,6 +3494,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s3, s2
; VI-NEXT: s_ashr_i32 s2, s2, 16
@@ -3403,6 +3577,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3421,6 +3598,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3510,6 +3690,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3528,6 +3711,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3617,6 +3803,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3635,6 +3824,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3724,6 +3916,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3742,6 +3937,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3855,9 +4053,12 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_load_dword v4, v[0:1]
@@ -3886,10 +4087,13 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -4005,9 +4209,12 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_load_dword v4, v[0:1]
@@ -4035,10 +4242,13 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -4112,5 +4322,5 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
index f31f577e3896b..b1ce5a3423f20 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -180,6 +180,9 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -260,6 +263,9 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -341,6 +347,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -403,6 +412,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -465,6 +477,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -527,6 +542,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -588,6 +606,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -632,5 +653,5 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index 8b95b26f142db..5803821a1d2c0 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -176,6 +176,9 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1)
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -254,6 +257,9 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -333,6 +339,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 {
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -393,6 +402,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -454,6 +466,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 {
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -514,6 +529,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -557,6 +575,6 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index c942426bcc720..4741593cd2270 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -10,32 +10,33 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908: bb.0 (%ir-block.0):
; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX908-NEXT: {{ $}}
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %6
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %7
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %7
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %8
+
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX908-NEXT: S_ENDPGM 0
;
; PEI-GFX908-LABEL: name: partial_copy
; PEI-GFX908: bb.0 (%ir-block.0):
- ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
+ ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
; PEI-GFX908-NEXT: {{ $}}
- ; PEI-GFX908-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
- ; PEI-GFX908-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
- ; PEI-GFX908-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; PEI-GFX908-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+ ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+ ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
@@ -44,7 +45,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+ ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -55,31 +56,31 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A: bb.0 (%ir-block.0):
; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX90A-NEXT: {{ $}}
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %6
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %7
- ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %7
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %8
+ ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
- ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX90A-NEXT: S_ENDPGM 0
;
; PEI-GFX90A-LABEL: name: partial_copy
; PEI-GFX90A: bb.0 (%ir-block.0):
- ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
+ ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
; PEI-GFX90A-NEXT: {{ $}}
- ; PEI-GFX90A-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
- ; PEI-GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
- ; PEI-GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+ ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+ ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
@@ -87,7 +88,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+ ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
@@ -104,4 +105,4 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
-attributes #0 = { nounwind "amdgpu-num-vgpr"="5" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { nounwind "amdgpu-num-vgpr"="5" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index 71e37bd7ee312..c26f0926d86b2 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -19,16 +19,16 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preload_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB0_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB0_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -54,17 +54,16 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: preload_unused_arg_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB1_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB1_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s12
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -181,7 +180,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: incorrect_type_i64_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB5_0
; GFX90a-NEXT: .p2align 8
@@ -191,7 +190,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i64, ptr addrspace(4) %imp_arg_ptr
@@ -217,7 +216,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: incorrect_type_i16_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB6_0
; GFX90a-NEXT: .p2align 8
@@ -227,7 +226,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i16, ptr addrspace(4) %imp_arg_ptr
@@ -252,16 +251,15 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preload_block_count_y:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB7_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB7_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
@@ -289,7 +287,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: random_incorrect_offset:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB8_0
; GFX90a-NEXT: .p2align 8
@@ -300,7 +298,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
@@ -327,17 +325,16 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preload_block_count_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB9_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB9_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s12
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
@@ -366,19 +363,18 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa
;
; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB10_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB10_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
-; GFX90a-NEXT: s_add_i32 s0, s10, s0
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
+; GFX90a-NEXT: s_add_i32 s0, s12, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -408,19 +404,18 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_block_count_xyz:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB11_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB11_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
@@ -454,17 +449,17 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_workgroup_size_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB12_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB12_0:
-; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
@@ -492,17 +487,17 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_workgroup_size_y:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB13_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB13_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
+; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
@@ -531,18 +526,18 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_workgroup_size_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB14_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB14_0:
-; GFX90a-NEXT: s_and_b32 s0, s12, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s14, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
@@ -575,22 +570,22 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
;
; GFX90a-LABEL: preload_workgroup_size_xyz:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB15_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB15_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
-; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff
-; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff
+; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-NEXT: s_and_b32 s1, s13, 0xffff
+; GFX90a-NEXT: s_and_b32 s2, s14, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
@@ -628,18 +623,18 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-LABEL: preload_remainder_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB16_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB16_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s12, 16
+; GFX90a-NEXT: s_lshr_b32 s0, s14, 16
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
@@ -668,18 +663,16 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-LABEL: preloadremainder_y:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB17_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB17_0:
-; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s15, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
@@ -708,18 +701,16 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-LABEL: preloadremainder_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB18_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB18_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-NEXT: s_lshr_b32 s0, s15, 16
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
@@ -752,22 +743,20 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preloadremainder_xyz:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB19_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB19_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT: s_lshr_b32 s1, s12, 16
-; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff
+; GFX90a-NEXT: s_lshr_b32 s0, s15, 16
+; GFX90a-NEXT: s_lshr_b32 s1, s14, 16
+; GFX90a-NEXT: s_and_b32 s2, s15, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
@@ -844,10 +833,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
;
; GFX90a-LABEL: preload_block_max_user_sgprs:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB21_0
; GFX90a-NEXT: .p2align 8
@@ -857,7 +843,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -887,21 +873,23 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt
;
; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB22_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB22_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff
+; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x1c
+; GFX90a-NEXT: s_and_b32 s1, s14, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: s_lshr_b32 s0, s0, 16
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index fe6378435a42e..7ae0c11dca279 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -21,17 +21,17 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0)
;
; GFX90a-LABEL: ptr1_i8:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB0_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB0_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
@@ -56,17 +56,17 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero
;
; GFX90a-LABEL: ptr1_i8_zext_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB1_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB1_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -91,17 +91,17 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16
;
; GFX90a-LABEL: ptr1_i16_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB2_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB2_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -125,16 +125,16 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32
;
; GFX90a-LABEL: ptr1_i32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB3_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB3_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store i32 %arg0, ptr addrspace(1) %out
ret void
@@ -160,18 +160,17 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspa
;
; GFX90a-LABEL: i32_ptr1_i32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB4_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB4_0:
-; GFX90a-NEXT: s_add_i32 s0, s6, s10
+; GFX90a-NEXT: s_add_i32 s0, s8, s12
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
@@ -198,19 +197,19 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: ptr1_i16_i16_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB5_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB5_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s8, 16
-; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-NEXT: s_lshr_b32 s0, s10, 16
+; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff
; GFX90a-NEXT: s_add_i32 s0, s1, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
@@ -236,16 +235,16 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2
;
; GFX90a-LABEL: ptr1_v2i8_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB6_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB6_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <2 x i8> %in, ptr addrspace(1) %out
ret void
@@ -274,7 +273,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
;
; GFX90a-LABEL: byref_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB7_0
; GFX90a-NEXT: .p2align 8
@@ -285,9 +284,9 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
@@ -320,7 +319,7 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: byref_staggered_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB8_0
; GFX90a-NEXT: .p2align 8
@@ -331,9 +330,9 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
@@ -370,26 +369,26 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x
;
; GFX90a-LABEL: v8i32_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB9_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB9_0:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v0, s16
+; GFX90a-NEXT: v_mov_b32_e32 v1, s17
+; GFX90a-NEXT: v_mov_b32_e32 v2, s18
+; GFX90a-NEXT: v_mov_b32_e32 v3, s19
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
+; GFX90a-NEXT: s_nop 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-NEXT: s_nop 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
store <8 x i32> %in, ptr addrspace(1) %out, align 4
ret void
@@ -414,18 +413,17 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-LABEL: v3i16_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB10_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB10_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <3 x i16> %in, ptr addrspace(1) %out, align 4
ret void
@@ -451,19 +449,17 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-LABEL: v3i32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB11_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB11_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
store <3 x i32> %in, ptr addrspace(1) %out, align 4
ret void
@@ -489,19 +485,17 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-LABEL: v3f32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB12_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB12_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
store <3 x float> %in, ptr addrspace(1) %out, align 4
ret void
@@ -533,25 +527,24 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou
;
; GFX90a-LABEL: v5i8_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB13_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB13_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s8, 24
+; GFX90a-NEXT: s_lshr_b32 s1, s10, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s0, s0, s1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_byte v0, v1, s[6:7] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_byte v0, v1, s[8:9] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <5 x i8> %in, ptr addrspace(1) %out, align 4
ret void
@@ -587,29 +580,29 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x
;
; GFX90a-LABEL: v5f64_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB14_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB14_0:
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x40
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s16
+; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] offset:32
+; GFX90a-NEXT: v_mov_b32_e32 v1, s17
+; GFX90a-NEXT: v_mov_b32_e32 v2, s18
+; GFX90a-NEXT: v_mov_b32_e32 v3, s19
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
+; GFX90a-NEXT: s_nop 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-NEXT: s_nop 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
@@ -647,31 +640,30 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8
;
; GFX90a-LABEL: v8i8_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB15_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB15_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s9, 24
+; GFX90a-NEXT: s_lshr_b32 s1, s11, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s9, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s2, s11, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_lshr_b32 s2, s8, 24
+; GFX90a-NEXT: s_lshr_b32 s2, s10, 24
; GFX90a-NEXT: s_lshl_b32 s2, s2, 8
-; GFX90a-NEXT: s_bfe_u32 s3, s8, 0x80010
-; GFX90a-NEXT: s_and_b32 s0, s9, 0xffff
+; GFX90a-NEXT: s_bfe_u32 s3, s10, 0x80010
+; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s2, s3, s2
; GFX90a-NEXT: s_or_b32 s0, s0, s1
-; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff
; GFX90a-NEXT: s_lshl_b32 s2, s2, 16
; GFX90a-NEXT: s_or_b32 s1, s1, s2
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
store <8 x i8> %in, ptr addrspace(1) %out
ret void
@@ -694,16 +686,15 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i
;
; GFX90a-LABEL: i64_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB16_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB16_0:
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
store i64 %a, ptr addrspace(1) %out, align 8
ret void
@@ -726,16 +717,15 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, d
;
; GFX90a-LABEL: f64_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB17_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB17_0:
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
store double %in, ptr addrspace(1) %out
ret void
@@ -758,16 +748,16 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: half_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB18_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB18_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
ret void
@@ -790,16 +780,16 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out
;
; GFX90a-LABEL: bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB19_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB19_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store bfloat %in, ptr addrspace(1) %out
ret void
@@ -822,16 +812,16 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: v2bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB20_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB20_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <2 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -856,18 +846,17 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: v3bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB21_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB21_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <3 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -893,19 +882,17 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: v6bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB22_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB22_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
store <6 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -934,24 +921,24 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr
;
; GFX90a-LABEL: half_v7bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB23_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB23_0:
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NEXT: global_store_short v3, v0, s[6:7]
-; GFX90a-NEXT: v_mov_b32_e32 v0, s13
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: global_store_short v3, v0, s[0:1] offset:12
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NEXT: global_store_short v3, v0, s[8:9]
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v0, s3
+; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
store <7 x bfloat> %in2, ptr addrspace(1) %out2
@@ -976,17 +963,17 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1
;
; GFX90a-LABEL: i1_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB24_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB24_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 1
+; GFX90a-NEXT: s_and_b32 s0, s10, 1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_byte v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_byte v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store i1 %in, ptr addrspace(1) %out
ret void
@@ -1013,20 +1000,18 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: fp128_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB25_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB25_0:
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: v_mov_b32_e32 v3, s13
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
store fp128 %in, ptr addrspace(1) %out
ret void
@@ -1059,26 +1044,25 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: v7i8_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB26_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB26_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s8, 24
+; GFX90a-NEXT: s_lshr_b32 s1, s10, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s0, s0, s1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[6:7] offset:6
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[8:9] offset:6
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <7 x i8> %in, ptr addrspace(1) %out
ret void
@@ -1106,21 +1090,19 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out
;
; GFX90a-LABEL: v7half_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB27_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB27_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s13
-; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s15
+; GFX90a-NEXT: global_store_short v3, v0, s[8:9] offset:12
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
store <7 x half> %in, ptr addrspace(1) %out
ret void
@@ -1145,18 +1127,18 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %ou
;
; GFX90a-LABEL: i16_i32_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB28_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB28_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_dword v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i32 %in2, ptr addrspace(1) %out2
@@ -1184,22 +1166,22 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %
;
; GFX90a-LABEL: i16_v3i32_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB29_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB29_0:
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v4, s8
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: global_store_short v3, v4, s[6:7]
+; GFX90a-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
+; GFX90a-NEXT: v_mov_b32_e32 v4, s10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NEXT: global_store_short v3, v4, s[8:9]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <3 x i32> %in2, ptr addrspace(1) %out2
@@ -1224,17 +1206,17 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %ou
;
; GFX90a-LABEL: i16_i16_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB30_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB30_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[10:11]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i16 %in2, ptr addrspace(1) %out2
@@ -1264,22 +1246,22 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: i16_v2i8_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB31_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB31_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-NEXT: s_lshr_b32 s0, s10, 24
; GFX90a-NEXT: s_lshl_b32 s0, s0, 8
-; GFX90a-NEXT: s_bfe_u32 s1, s8, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s1, s10, 0x80010
; GFX90a-NEXT: s_or_b32 s0, s1, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_short v0, v1, s[10:11]
+; GFX90a-NEXT: global_store_short v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <2 x i8> %in2, ptr addrspace(1) %out2
@@ -1308,7 +1290,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
;
; GFX90a-LABEL: i32_ptr1_i32_staggered_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB32_0
; GFX90a-NEXT: .p2align 8
@@ -1318,7 +1300,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_add_i32 s2, s6, s2
+; GFX90a-NEXT: s_add_i32 s2, s8, s2
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
@@ -1345,17 +1327,16 @@ define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: ptr1_i8_trailing_unused:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB33_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB33_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index bc0b81749460f..8f25e6519588b 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -1,11 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -earlycse-debug-hash -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_pat1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
@@ -27,15 +30,18 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b,
ret void
}
-define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a) #0 {
+define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a) {
; GCN-LABEL: v_sad_u32_constant_pat1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0x5a
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_sad_u32 v2, s2, v0, 20
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -52,11 +58,14 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a
ret void
}
-define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_pat2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
@@ -76,15 +85,17 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b,
ret void
}
-define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_sub_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_min_u32 s3, s0, s1
; GCN-NEXT: s_max_u32 s0, s0, s1
@@ -92,8 +103,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
@@ -112,22 +124,25 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
ret void
}
-define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_add_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_mov_b32_e32 v3, s2
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_sad_u32 v2, s0, v2, v3
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -144,24 +159,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i
ret void
}
-define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_max_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_max_u32 s3, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_store_dword v[0:1], v3
; GCN-NEXT: s_endpgm
@@ -179,24 +197,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
ret void
}
-define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_min_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_min_u32 s3, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_store_dword v[0:1], v3
; GCN-NEXT: s_endpgm
@@ -215,24 +236,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
ret void
}
-define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_sub_pat2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s3, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_store_dword v[0:1], v3
; GCN-NEXT: s_endpgm
@@ -248,15 +272,17 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
ret void
}
-define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_select_pat2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_min_u32 s3, s0, s1
; GCN-NEXT: s_max_u32 s0, s0, s1
@@ -264,8 +290,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
@@ -282,9 +309,12 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; GCN-LABEL: v_sad_u32_vector_pat1:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc
; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
@@ -318,9 +348,12 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32
ret void
}
-define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; GCN-LABEL: v_sad_u32_vector_pat2:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc
; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
@@ -352,12 +385,14 @@ define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32
ret void
}
-define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) #0 {
+define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) {
; GCN-LABEL: v_sad_u32_i16_pat1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-NEXT: s_lshr_b32 s0, s0, 16
@@ -365,6 +400,7 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_sad_u32 v2, s4, v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -381,9 +417,12 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
ret void
}
-define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) #0 {
+define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
; GCN-LABEL: v_sad_u32_i16_pat2:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -411,11 +450,14 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) #0 {
ret void
}
-define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) #0 {
+define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) {
; GCN-LABEL: v_sad_u32_i8_pat1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s3, s2, 0xff
; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008
@@ -440,9 +482,12 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
ret void
}
-define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) #0 {
+define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
; GCN-LABEL: v_sad_u32_i8_pat2:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -470,11 +515,14 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) #0 {
ret void
}
-define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) #0 {
+define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
; GCN-LABEL: s_sad_u32_i8_pat2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s3, s2, 0xff
; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008
@@ -497,11 +545,14 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %
ret void
}
-define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) #0 {
+define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
; GCN-LABEL: v_sad_u32_mismatched_operands_pat1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_max_u32 s6, s0, s1
; GCN-NEXT: s_cmp_le_u32 s0, s1
@@ -526,11 +577,14 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %
ret void
}
-define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) #0 {
+define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
; GCN-LABEL: v_sad_u32_mismatched_operands_pat2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s3, s0, s3
; GCN-NEXT: s_sub_i32 s6, s1, s0
@@ -553,4 +607,3 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %
ret void
}
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index 19a41a89b6ac6..29448ab2d822e 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -9,6 +9,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v5, s3
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -24,6 +26,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v5, s3
; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -39,6 +43,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, s3
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -55,6 +61,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -88,6 +96,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v5, s3
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -103,6 +113,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v5, s3
; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -118,6 +130,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, s3
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -134,6 +148,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -164,5 +180,5 @@ entry:
declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index f072f68c67ab3..90dfd5a21d107 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -20,179 +20,183 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: ; def s[2:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[4:7]
-; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
; CHECK-NEXT: v_writelane_b32 v22, s2, 0
; CHECK-NEXT: v_writelane_b32 v22, s3, 1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[48:51]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[4:11]
+; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_writelane_b32 v22, s4, 2
; CHECK-NEXT: v_writelane_b32 v22, s5, 3
; CHECK-NEXT: v_writelane_b32 v22, s6, 4
-; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
; CHECK-NEXT: v_writelane_b32 v22, s7, 5
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[4:11]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s4, 6
-; CHECK-NEXT: v_writelane_b32 v22, s5, 7
-; CHECK-NEXT: v_writelane_b32 v22, s6, 8
-; CHECK-NEXT: v_writelane_b32 v22, s7, 9
-; CHECK-NEXT: v_writelane_b32 v22, s8, 10
-; CHECK-NEXT: v_writelane_b32 v22, s9, 11
-; CHECK-NEXT: v_writelane_b32 v22, s10, 12
-; CHECK-NEXT: v_writelane_b32 v22, s11, 13
+; CHECK-NEXT: v_writelane_b32 v22, s8, 6
+; CHECK-NEXT: v_writelane_b32 v22, s9, 7
+; CHECK-NEXT: v_writelane_b32 v22, s10, 8
+; CHECK-NEXT: v_writelane_b32 v22, s11, 9
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[4:19]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s4, 14
-; CHECK-NEXT: v_writelane_b32 v22, s5, 15
-; CHECK-NEXT: v_writelane_b32 v22, s6, 16
-; CHECK-NEXT: v_writelane_b32 v22, s7, 17
-; CHECK-NEXT: v_writelane_b32 v22, s8, 18
-; CHECK-NEXT: v_writelane_b32 v22, s9, 19
-; CHECK-NEXT: v_writelane_b32 v22, s10, 20
-; CHECK-NEXT: v_writelane_b32 v22, s11, 21
-; CHECK-NEXT: v_writelane_b32 v22, s12, 22
-; CHECK-NEXT: v_writelane_b32 v22, s13, 23
-; CHECK-NEXT: v_writelane_b32 v22, s14, 24
-; CHECK-NEXT: v_writelane_b32 v22, s15, 25
-; CHECK-NEXT: v_writelane_b32 v22, s16, 26
-; CHECK-NEXT: v_writelane_b32 v22, s17, 27
-; CHECK-NEXT: v_writelane_b32 v22, s18, 28
-; CHECK-NEXT: v_writelane_b32 v22, s19, 29
+; CHECK-NEXT: v_writelane_b32 v22, s4, 10
+; CHECK-NEXT: v_writelane_b32 v22, s5, 11
+; CHECK-NEXT: v_writelane_b32 v22, s6, 12
+; CHECK-NEXT: v_writelane_b32 v22, s7, 13
+; CHECK-NEXT: v_writelane_b32 v22, s8, 14
+; CHECK-NEXT: v_writelane_b32 v22, s9, 15
+; CHECK-NEXT: v_writelane_b32 v22, s10, 16
+; CHECK-NEXT: v_writelane_b32 v22, s11, 17
+; CHECK-NEXT: v_writelane_b32 v22, s12, 18
+; CHECK-NEXT: v_writelane_b32 v22, s13, 19
+; CHECK-NEXT: v_writelane_b32 v22, s14, 20
+; CHECK-NEXT: v_writelane_b32 v22, s15, 21
+; CHECK-NEXT: v_writelane_b32 v22, s16, 22
+; CHECK-NEXT: v_writelane_b32 v22, s17, 23
+; CHECK-NEXT: v_writelane_b32 v22, s18, 24
+; CHECK-NEXT: v_writelane_b32 v22, s19, 25
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[42:43]
+; CHECK-NEXT: ; def s[38:39]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[52:55]
+; CHECK-NEXT: ; def s[44:47]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[4:11]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s4, 30
-; CHECK-NEXT: v_writelane_b32 v22, s5, 31
-; CHECK-NEXT: v_writelane_b32 v22, s6, 32
-; CHECK-NEXT: v_writelane_b32 v22, s7, 33
-; CHECK-NEXT: v_writelane_b32 v22, s8, 34
-; CHECK-NEXT: v_writelane_b32 v22, s9, 35
-; CHECK-NEXT: v_writelane_b32 v22, s10, 36
-; CHECK-NEXT: v_writelane_b32 v22, s11, 37
+; CHECK-NEXT: v_writelane_b32 v22, s4, 26
+; CHECK-NEXT: v_writelane_b32 v22, s5, 27
+; CHECK-NEXT: v_writelane_b32 v22, s6, 28
+; CHECK-NEXT: v_writelane_b32 v22, s7, 29
+; CHECK-NEXT: v_writelane_b32 v22, s8, 30
+; CHECK-NEXT: v_writelane_b32 v22, s9, 31
+; CHECK-NEXT: v_writelane_b32 v22, s10, 32
+; CHECK-NEXT: v_writelane_b32 v22, s11, 33
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[16:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[40:41]
+; CHECK-NEXT: ; def s[36:37]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[36:39]
+; CHECK-NEXT: ; def s[40:43]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[44:51]
+; CHECK-NEXT: ; def s[0:7]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_writelane_b32 v22, s0, 34
+; CHECK-NEXT: v_writelane_b32 v22, s1, 35
+; CHECK-NEXT: v_writelane_b32 v22, s2, 36
+; CHECK-NEXT: v_writelane_b32 v22, s3, 37
+; CHECK-NEXT: v_writelane_b32 v22, s4, 38
+; CHECK-NEXT: v_writelane_b32 v22, s5, 39
+; CHECK-NEXT: v_writelane_b32 v22, s6, 40
+; CHECK-NEXT: v_writelane_b32 v22, s7, 41
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s0, 38
-; CHECK-NEXT: v_writelane_b32 v22, s1, 39
-; CHECK-NEXT: v_writelane_b32 v22, s2, 40
-; CHECK-NEXT: v_writelane_b32 v22, s3, 41
-; CHECK-NEXT: v_writelane_b32 v22, s4, 42
-; CHECK-NEXT: v_writelane_b32 v22, s5, 43
-; CHECK-NEXT: v_writelane_b32 v22, s6, 44
-; CHECK-NEXT: v_writelane_b32 v22, s7, 45
-; CHECK-NEXT: v_writelane_b32 v22, s8, 46
-; CHECK-NEXT: v_writelane_b32 v22, s9, 47
-; CHECK-NEXT: v_writelane_b32 v22, s10, 48
-; CHECK-NEXT: v_writelane_b32 v22, s11, 49
-; CHECK-NEXT: v_writelane_b32 v22, s12, 50
-; CHECK-NEXT: v_writelane_b32 v22, s13, 51
-; CHECK-NEXT: v_writelane_b32 v22, s14, 52
-; CHECK-NEXT: v_writelane_b32 v22, s15, 53
+; CHECK-NEXT: v_writelane_b32 v22, s0, 42
+; CHECK-NEXT: v_writelane_b32 v22, s1, 43
+; CHECK-NEXT: v_writelane_b32 v22, s2, 44
+; CHECK-NEXT: v_writelane_b32 v22, s3, 45
+; CHECK-NEXT: v_writelane_b32 v22, s4, 46
+; CHECK-NEXT: v_writelane_b32 v22, s5, 47
+; CHECK-NEXT: v_writelane_b32 v22, s6, 48
+; CHECK-NEXT: v_writelane_b32 v22, s7, 49
+; CHECK-NEXT: v_writelane_b32 v22, s8, 50
+; CHECK-NEXT: v_writelane_b32 v22, s9, 51
+; CHECK-NEXT: v_writelane_b32 v22, s10, 52
+; CHECK-NEXT: v_writelane_b32 v22, s11, 53
+; CHECK-NEXT: v_writelane_b32 v22, s12, 54
+; CHECK-NEXT: v_writelane_b32 v22, s13, 55
+; CHECK-NEXT: v_writelane_b32 v22, s14, 56
+; CHECK-NEXT: v_writelane_b32 v22, s15, 57
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[34:35]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s0, 54
-; CHECK-NEXT: v_writelane_b32 v22, s1, 55
-; CHECK-NEXT: v_writelane_b32 v22, s2, 56
-; CHECK-NEXT: v_writelane_b32 v22, s3, 57
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[0:7]
-; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_writelane_b32 v22, s0, 58
; CHECK-NEXT: v_writelane_b32 v22, s1, 59
; CHECK-NEXT: v_writelane_b32 v22, s2, 60
-; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v22, s3, 61
-; CHECK-NEXT: v_writelane_b32 v22, s4, 62
-; CHECK-NEXT: v_writelane_b32 v23, s6, 0
-; CHECK-NEXT: v_writelane_b32 v22, s5, 63
-; CHECK-NEXT: v_writelane_b32 v23, s7, 1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[0:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
+; CHECK-NEXT: v_writelane_b32 v22, s0, 62
+; CHECK-NEXT: v_writelane_b32 v23, s2, 0
+; CHECK-NEXT: v_writelane_b32 v23, s3, 1
+; CHECK-NEXT: v_writelane_b32 v23, s4, 2
+; CHECK-NEXT: v_writelane_b32 v23, s5, 3
+; CHECK-NEXT: v_writelane_b32 v23, s6, 4
+; CHECK-NEXT: v_writelane_b32 v22, s1, 63
+; CHECK-NEXT: v_writelane_b32 v23, s7, 5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 2
-; CHECK-NEXT: v_writelane_b32 v23, s1, 3
-; CHECK-NEXT: v_writelane_b32 v23, s2, 4
-; CHECK-NEXT: v_writelane_b32 v23, s3, 5
-; CHECK-NEXT: v_writelane_b32 v23, s4, 6
-; CHECK-NEXT: v_writelane_b32 v23, s5, 7
-; CHECK-NEXT: v_writelane_b32 v23, s6, 8
-; CHECK-NEXT: v_writelane_b32 v23, s7, 9
-; CHECK-NEXT: v_writelane_b32 v23, s8, 10
-; CHECK-NEXT: v_writelane_b32 v23, s9, 11
-; CHECK-NEXT: v_writelane_b32 v23, s10, 12
-; CHECK-NEXT: v_writelane_b32 v23, s11, 13
-; CHECK-NEXT: v_writelane_b32 v23, s12, 14
-; CHECK-NEXT: v_writelane_b32 v23, s13, 15
-; CHECK-NEXT: v_writelane_b32 v23, s14, 16
-; CHECK-NEXT: v_writelane_b32 v23, s15, 17
+; CHECK-NEXT: v_writelane_b32 v23, s0, 6
+; CHECK-NEXT: v_writelane_b32 v23, s1, 7
+; CHECK-NEXT: v_writelane_b32 v23, s2, 8
+; CHECK-NEXT: v_writelane_b32 v23, s3, 9
+; CHECK-NEXT: v_writelane_b32 v23, s4, 10
+; CHECK-NEXT: v_writelane_b32 v23, s5, 11
+; CHECK-NEXT: v_writelane_b32 v23, s6, 12
+; CHECK-NEXT: v_writelane_b32 v23, s7, 13
+; CHECK-NEXT: v_writelane_b32 v23, s8, 14
+; CHECK-NEXT: v_writelane_b32 v23, s9, 15
+; CHECK-NEXT: v_writelane_b32 v23, s10, 16
+; CHECK-NEXT: v_writelane_b32 v23, s11, 17
+; CHECK-NEXT: v_writelane_b32 v23, s12, 18
+; CHECK-NEXT: v_writelane_b32 v23, s13, 19
+; CHECK-NEXT: v_writelane_b32 v23, s14, 20
+; CHECK-NEXT: v_writelane_b32 v23, s15, 21
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 18
-; CHECK-NEXT: v_writelane_b32 v23, s1, 19
+; CHECK-NEXT: v_writelane_b32 v23, s0, 22
+; CHECK-NEXT: v_writelane_b32 v23, s1, 23
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 20
-; CHECK-NEXT: v_writelane_b32 v23, s1, 21
-; CHECK-NEXT: v_writelane_b32 v23, s2, 22
-; CHECK-NEXT: v_writelane_b32 v23, s3, 23
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[0:7]
-; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_writelane_b32 v23, s0, 24
; CHECK-NEXT: v_writelane_b32 v23, s1, 25
; CHECK-NEXT: v_writelane_b32 v23, s2, 26
; CHECK-NEXT: v_writelane_b32 v23, s3, 27
-; CHECK-NEXT: v_writelane_b32 v23, s4, 28
-; CHECK-NEXT: v_writelane_b32 v23, s5, 29
-; CHECK-NEXT: v_writelane_b32 v23, s6, 30
-; CHECK-NEXT: v_writelane_b32 v23, s7, 31
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[0:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_writelane_b32 v23, s0, 28
+; CHECK-NEXT: v_writelane_b32 v23, s1, 29
+; CHECK-NEXT: v_writelane_b32 v23, s2, 30
+; CHECK-NEXT: v_writelane_b32 v23, s3, 31
+; CHECK-NEXT: v_writelane_b32 v23, s4, 32
+; CHECK-NEXT: v_writelane_b32 v23, s5, 33
+; CHECK-NEXT: v_writelane_b32 v23, s6, 34
+; CHECK-NEXT: v_writelane_b32 v23, s7, 35
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 32
-; CHECK-NEXT: v_writelane_b32 v23, s1, 33
-; CHECK-NEXT: v_writelane_b32 v23, s2, 34
-; CHECK-NEXT: v_writelane_b32 v23, s3, 35
-; CHECK-NEXT: v_writelane_b32 v23, s4, 36
-; CHECK-NEXT: v_writelane_b32 v23, s5, 37
-; CHECK-NEXT: v_writelane_b32 v23, s6, 38
-; CHECK-NEXT: v_writelane_b32 v23, s7, 39
-; CHECK-NEXT: v_writelane_b32 v23, s8, 40
-; CHECK-NEXT: v_writelane_b32 v23, s9, 41
-; CHECK-NEXT: v_writelane_b32 v23, s10, 42
-; CHECK-NEXT: v_writelane_b32 v23, s11, 43
-; CHECK-NEXT: v_writelane_b32 v23, s12, 44
-; CHECK-NEXT: v_writelane_b32 v23, s13, 45
-; CHECK-NEXT: v_writelane_b32 v23, s14, 46
-; CHECK-NEXT: v_writelane_b32 v23, s15, 47
+; CHECK-NEXT: v_writelane_b32 v23, s0, 36
+; CHECK-NEXT: v_writelane_b32 v23, s1, 37
+; CHECK-NEXT: v_writelane_b32 v23, s2, 38
+; CHECK-NEXT: v_writelane_b32 v23, s3, 39
+; CHECK-NEXT: v_writelane_b32 v23, s4, 40
+; CHECK-NEXT: v_writelane_b32 v23, s5, 41
+; CHECK-NEXT: v_writelane_b32 v23, s6, 42
+; CHECK-NEXT: v_writelane_b32 v23, s7, 43
+; CHECK-NEXT: v_writelane_b32 v23, s8, 44
+; CHECK-NEXT: v_writelane_b32 v23, s9, 45
+; CHECK-NEXT: v_writelane_b32 v23, s10, 46
+; CHECK-NEXT: v_writelane_b32 v23, s11, 47
+; CHECK-NEXT: v_writelane_b32 v23, s12, 48
+; CHECK-NEXT: v_writelane_b32 v23, s13, 49
+; CHECK-NEXT: v_writelane_b32 v23, s14, 50
+; CHECK-NEXT: v_writelane_b32 v23, s15, 51
; CHECK-NEXT: s_cbranch_scc0 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %ret
; CHECK-NEXT: s_endpgm
@@ -206,166 +210,170 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: v_readlane_b32 s1, v22, 3
; CHECK-NEXT: v_readlane_b32 s2, v22, 4
; CHECK-NEXT: v_readlane_b32 s3, v22, 5
+; CHECK-NEXT: v_readlane_b32 s4, v22, 6
+; CHECK-NEXT: v_readlane_b32 s5, v22, 7
+; CHECK-NEXT: v_readlane_b32 s6, v22, 8
+; CHECK-NEXT: v_readlane_b32 s7, v22, 9
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[0:3]
+; CHECK-NEXT: ; use s[48:51]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 6
-; CHECK-NEXT: v_readlane_b32 s1, v22, 7
-; CHECK-NEXT: v_readlane_b32 s2, v22, 8
-; CHECK-NEXT: v_readlane_b32 s3, v22, 9
-; CHECK-NEXT: v_readlane_b32 s4, v22, 10
-; CHECK-NEXT: v_readlane_b32 s5, v22, 11
-; CHECK-NEXT: v_readlane_b32 s6, v22, 12
-; CHECK-NEXT: v_readlane_b32 s7, v22, 13
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 14
-; CHECK-NEXT: v_readlane_b32 s1, v22, 15
-; CHECK-NEXT: v_readlane_b32 s2, v22, 16
-; CHECK-NEXT: v_readlane_b32 s3, v22, 17
-; CHECK-NEXT: v_readlane_b32 s4, v22, 18
-; CHECK-NEXT: v_readlane_b32 s5, v22, 19
-; CHECK-NEXT: v_readlane_b32 s6, v22, 20
-; CHECK-NEXT: v_readlane_b32 s7, v22, 21
-; CHECK-NEXT: v_readlane_b32 s8, v22, 22
-; CHECK-NEXT: v_readlane_b32 s9, v22, 23
-; CHECK-NEXT: v_readlane_b32 s10, v22, 24
-; CHECK-NEXT: v_readlane_b32 s11, v22, 25
-; CHECK-NEXT: v_readlane_b32 s12, v22, 26
-; CHECK-NEXT: v_readlane_b32 s13, v22, 27
-; CHECK-NEXT: v_readlane_b32 s14, v22, 28
-; CHECK-NEXT: v_readlane_b32 s15, v22, 29
+; CHECK-NEXT: v_readlane_b32 s0, v22, 10
+; CHECK-NEXT: v_readlane_b32 s1, v22, 11
+; CHECK-NEXT: v_readlane_b32 s2, v22, 12
+; CHECK-NEXT: v_readlane_b32 s3, v22, 13
+; CHECK-NEXT: v_readlane_b32 s4, v22, 14
+; CHECK-NEXT: v_readlane_b32 s5, v22, 15
+; CHECK-NEXT: v_readlane_b32 s6, v22, 16
+; CHECK-NEXT: v_readlane_b32 s7, v22, 17
+; CHECK-NEXT: v_readlane_b32 s8, v22, 18
+; CHECK-NEXT: v_readlane_b32 s9, v22, 19
+; CHECK-NEXT: v_readlane_b32 s10, v22, 20
+; CHECK-NEXT: v_readlane_b32 s11, v22, 21
+; CHECK-NEXT: v_readlane_b32 s12, v22, 22
+; CHECK-NEXT: v_readlane_b32 s13, v22, 23
+; CHECK-NEXT: v_readlane_b32 s14, v22, 24
+; CHECK-NEXT: v_readlane_b32 s15, v22, 25
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 30
-; CHECK-NEXT: v_readlane_b32 s1, v22, 31
-; CHECK-NEXT: v_readlane_b32 s2, v22, 32
-; CHECK-NEXT: v_readlane_b32 s3, v22, 33
-; CHECK-NEXT: v_readlane_b32 s4, v22, 34
-; CHECK-NEXT: v_readlane_b32 s5, v22, 35
-; CHECK-NEXT: v_readlane_b32 s6, v22, 36
-; CHECK-NEXT: v_readlane_b32 s7, v22, 37
+; CHECK-NEXT: v_readlane_b32 s0, v22, 26
+; CHECK-NEXT: v_readlane_b32 s1, v22, 27
+; CHECK-NEXT: v_readlane_b32 s2, v22, 28
+; CHECK-NEXT: v_readlane_b32 s3, v22, 29
+; CHECK-NEXT: v_readlane_b32 s4, v22, 30
+; CHECK-NEXT: v_readlane_b32 s5, v22, 31
+; CHECK-NEXT: v_readlane_b32 s6, v22, 32
+; CHECK-NEXT: v_readlane_b32 s7, v22, 33
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[42:43]
+; CHECK-NEXT: ; use s[38:39]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[52:55]
+; CHECK-NEXT: ; use s[44:47]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 38
-; CHECK-NEXT: v_readlane_b32 s1, v22, 39
-; CHECK-NEXT: v_readlane_b32 s2, v22, 40
-; CHECK-NEXT: v_readlane_b32 s3, v22, 41
+; CHECK-NEXT: v_readlane_b32 s0, v22, 34
+; CHECK-NEXT: v_readlane_b32 s1, v22, 35
+; CHECK-NEXT: v_readlane_b32 s2, v22, 36
+; CHECK-NEXT: v_readlane_b32 s3, v22, 37
+; CHECK-NEXT: v_readlane_b32 s4, v22, 38
+; CHECK-NEXT: v_readlane_b32 s5, v22, 39
+; CHECK-NEXT: v_readlane_b32 s6, v22, 40
+; CHECK-NEXT: v_readlane_b32 s7, v22, 41
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[16:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[40:41]
+; CHECK-NEXT: ; use s[36:37]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[36:39]
+; CHECK-NEXT: ; use s[40:43]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[44:51]
+; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s4, v22, 42
-; CHECK-NEXT: v_readlane_b32 s5, v22, 43
-; CHECK-NEXT: v_readlane_b32 s6, v22, 44
-; CHECK-NEXT: v_readlane_b32 s7, v22, 45
-; CHECK-NEXT: v_readlane_b32 s8, v22, 46
-; CHECK-NEXT: v_readlane_b32 s9, v22, 47
-; CHECK-NEXT: v_readlane_b32 s10, v22, 48
-; CHECK-NEXT: v_readlane_b32 s11, v22, 49
-; CHECK-NEXT: v_readlane_b32 s12, v22, 50
-; CHECK-NEXT: v_readlane_b32 s13, v22, 51
-; CHECK-NEXT: v_readlane_b32 s14, v22, 52
-; CHECK-NEXT: v_readlane_b32 s15, v22, 53
+; CHECK-NEXT: v_readlane_b32 s0, v22, 42
+; CHECK-NEXT: v_readlane_b32 s1, v22, 43
+; CHECK-NEXT: v_readlane_b32 s2, v22, 44
+; CHECK-NEXT: v_readlane_b32 s3, v22, 45
+; CHECK-NEXT: v_readlane_b32 s4, v22, 46
+; CHECK-NEXT: v_readlane_b32 s5, v22, 47
+; CHECK-NEXT: v_readlane_b32 s6, v22, 48
+; CHECK-NEXT: v_readlane_b32 s7, v22, 49
+; CHECK-NEXT: v_readlane_b32 s8, v22, 50
+; CHECK-NEXT: v_readlane_b32 s9, v22, 51
+; CHECK-NEXT: v_readlane_b32 s10, v22, 52
+; CHECK-NEXT: v_readlane_b32 s11, v22, 53
+; CHECK-NEXT: v_readlane_b32 s12, v22, 54
+; CHECK-NEXT: v_readlane_b32 s13, v22, 55
+; CHECK-NEXT: v_readlane_b32 s14, v22, 56
+; CHECK-NEXT: v_readlane_b32 s15, v22, 57
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 54
-; CHECK-NEXT: v_readlane_b32 s1, v22, 55
-; CHECK-NEXT: v_readlane_b32 s2, v22, 56
-; CHECK-NEXT: v_readlane_b32 s3, v22, 57
+; CHECK-NEXT: v_readlane_b32 s0, v22, 58
+; CHECK-NEXT: v_readlane_b32 s1, v22, 59
+; CHECK-NEXT: v_readlane_b32 s2, v22, 60
+; CHECK-NEXT: v_readlane_b32 s3, v22, 61
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[34:35]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 58
-; CHECK-NEXT: v_readlane_b32 s1, v22, 59
-; CHECK-NEXT: v_readlane_b32 s2, v22, 60
-; CHECK-NEXT: v_readlane_b32 s3, v22, 61
-; CHECK-NEXT: v_readlane_b32 s4, v22, 62
-; CHECK-NEXT: v_readlane_b32 s5, v22, 63
-; CHECK-NEXT: v_readlane_b32 s6, v23, 0
-; CHECK-NEXT: v_readlane_b32 s7, v23, 1
+; CHECK-NEXT: v_readlane_b32 s0, v22, 62
+; CHECK-NEXT: v_readlane_b32 s1, v22, 63
+; CHECK-NEXT: v_readlane_b32 s2, v23, 0
+; CHECK-NEXT: v_readlane_b32 s3, v23, 1
+; CHECK-NEXT: v_readlane_b32 s4, v23, 2
+; CHECK-NEXT: v_readlane_b32 s5, v23, 3
+; CHECK-NEXT: v_readlane_b32 s6, v23, 4
+; CHECK-NEXT: v_readlane_b32 s7, v23, 5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 2
-; CHECK-NEXT: v_readlane_b32 s1, v23, 3
-; CHECK-NEXT: v_readlane_b32 s2, v23, 4
-; CHECK-NEXT: v_readlane_b32 s3, v23, 5
-; CHECK-NEXT: v_readlane_b32 s4, v23, 6
-; CHECK-NEXT: v_readlane_b32 s5, v23, 7
-; CHECK-NEXT: v_readlane_b32 s6, v23, 8
-; CHECK-NEXT: v_readlane_b32 s7, v23, 9
-; CHECK-NEXT: v_readlane_b32 s8, v23, 10
-; CHECK-NEXT: v_readlane_b32 s9, v23, 11
-; CHECK-NEXT: v_readlane_b32 s10, v23, 12
-; CHECK-NEXT: v_readlane_b32 s11, v23, 13
-; CHECK-NEXT: v_readlane_b32 s12, v23, 14
-; CHECK-NEXT: v_readlane_b32 s13, v23, 15
-; CHECK-NEXT: v_readlane_b32 s14, v23, 16
-; CHECK-NEXT: v_readlane_b32 s15, v23, 17
+; CHECK-NEXT: v_readlane_b32 s0, v23, 6
+; CHECK-NEXT: v_readlane_b32 s1, v23, 7
+; CHECK-NEXT: v_readlane_b32 s2, v23, 8
+; CHECK-NEXT: v_readlane_b32 s3, v23, 9
+; CHECK-NEXT: v_readlane_b32 s4, v23, 10
+; CHECK-NEXT: v_readlane_b32 s5, v23, 11
+; CHECK-NEXT: v_readlane_b32 s6, v23, 12
+; CHECK-NEXT: v_readlane_b32 s7, v23, 13
+; CHECK-NEXT: v_readlane_b32 s8, v23, 14
+; CHECK-NEXT: v_readlane_b32 s9, v23, 15
+; CHECK-NEXT: v_readlane_b32 s10, v23, 16
+; CHECK-NEXT: v_readlane_b32 s11, v23, 17
+; CHECK-NEXT: v_readlane_b32 s12, v23, 18
+; CHECK-NEXT: v_readlane_b32 s13, v23, 19
+; CHECK-NEXT: v_readlane_b32 s14, v23, 20
+; CHECK-NEXT: v_readlane_b32 s15, v23, 21
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 18
-; CHECK-NEXT: v_readlane_b32 s1, v23, 19
+; CHECK-NEXT: v_readlane_b32 s0, v23, 22
+; CHECK-NEXT: v_readlane_b32 s1, v23, 23
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 20
-; CHECK-NEXT: v_readlane_b32 s1, v23, 21
-; CHECK-NEXT: v_readlane_b32 s2, v23, 22
-; CHECK-NEXT: v_readlane_b32 s3, v23, 23
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[0:3]
-; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_readlane_b32 s0, v23, 24
; CHECK-NEXT: v_readlane_b32 s1, v23, 25
; CHECK-NEXT: v_readlane_b32 s2, v23, 26
; CHECK-NEXT: v_readlane_b32 s3, v23, 27
-; CHECK-NEXT: v_readlane_b32 s4, v23, 28
-; CHECK-NEXT: v_readlane_b32 s5, v23, 29
-; CHECK-NEXT: v_readlane_b32 s6, v23, 30
-; CHECK-NEXT: v_readlane_b32 s7, v23, 31
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s0, v23, 28
+; CHECK-NEXT: v_readlane_b32 s1, v23, 29
+; CHECK-NEXT: v_readlane_b32 s2, v23, 30
+; CHECK-NEXT: v_readlane_b32 s3, v23, 31
+; CHECK-NEXT: v_readlane_b32 s4, v23, 32
+; CHECK-NEXT: v_readlane_b32 s5, v23, 33
+; CHECK-NEXT: v_readlane_b32 s6, v23, 34
+; CHECK-NEXT: v_readlane_b32 s7, v23, 35
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 32
-; CHECK-NEXT: v_readlane_b32 s1, v23, 33
-; CHECK-NEXT: v_readlane_b32 s2, v23, 34
-; CHECK-NEXT: v_readlane_b32 s3, v23, 35
-; CHECK-NEXT: v_readlane_b32 s4, v23, 36
-; CHECK-NEXT: v_readlane_b32 s5, v23, 37
-; CHECK-NEXT: v_readlane_b32 s6, v23, 38
-; CHECK-NEXT: v_readlane_b32 s7, v23, 39
-; CHECK-NEXT: v_readlane_b32 s8, v23, 40
-; CHECK-NEXT: v_readlane_b32 s9, v23, 41
-; CHECK-NEXT: v_readlane_b32 s10, v23, 42
-; CHECK-NEXT: v_readlane_b32 s11, v23, 43
-; CHECK-NEXT: v_readlane_b32 s12, v23, 44
-; CHECK-NEXT: v_readlane_b32 s13, v23, 45
-; CHECK-NEXT: v_readlane_b32 s14, v23, 46
-; CHECK-NEXT: v_readlane_b32 s15, v23, 47
+; CHECK-NEXT: v_readlane_b32 s0, v23, 36
+; CHECK-NEXT: v_readlane_b32 s1, v23, 37
+; CHECK-NEXT: v_readlane_b32 s2, v23, 38
+; CHECK-NEXT: v_readlane_b32 s3, v23, 39
+; CHECK-NEXT: v_readlane_b32 s4, v23, 40
+; CHECK-NEXT: v_readlane_b32 s5, v23, 41
+; CHECK-NEXT: v_readlane_b32 s6, v23, 42
+; CHECK-NEXT: v_readlane_b32 s7, v23, 43
+; CHECK-NEXT: v_readlane_b32 s8, v23, 44
+; CHECK-NEXT: v_readlane_b32 s9, v23, 45
+; CHECK-NEXT: v_readlane_b32 s10, v23, 46
+; CHECK-NEXT: v_readlane_b32 s11, v23, 47
+; CHECK-NEXT: v_readlane_b32 s12, v23, 48
+; CHECK-NEXT: v_readlane_b32 s13, v23, 49
+; CHECK-NEXT: v_readlane_b32 s14, v23, 50
+; CHECK-NEXT: v_readlane_b32 s15, v23, 51
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
@@ -426,4 +434,4 @@ ret:
}
attributes #0 = { nounwind }
-attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" "amdgpu-no-flat-scratch-init" }
+attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" }
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index 461500c8e740c..65a17ed67481c 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) #0 {
+define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
; GCN-LABEL: v_shl_i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26,7 +26,7 @@ define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) #0 {
ret i128 %shl
}
-define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) #0 {
+define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) {
; GCN-LABEL: v_lshr_i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52,7 +52,7 @@ define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) #0 {
ret i128 %shl
}
-define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) #0 {
+define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) {
; GCN-LABEL: v_ashr_i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -79,7 +79,7 @@ define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) #0 {
}
-define i128 @v_shl_i128_vk(i128 %lhs) #0 {
+define i128 @v_shl_i128_vk(i128 %lhs) {
; GCN-LABEL: v_shl_i128_vk:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -93,7 +93,7 @@ define i128 @v_shl_i128_vk(i128 %lhs) #0 {
ret i128 %shl
}
-define i128 @v_lshr_i128_vk(i128 %lhs) #0 {
+define i128 @v_lshr_i128_vk(i128 %lhs) {
; GCN-LABEL: v_lshr_i128_vk:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -106,7 +106,7 @@ define i128 @v_lshr_i128_vk(i128 %lhs) #0 {
ret i128 %shl
}
-define i128 @v_ashr_i128_vk(i128 %lhs) #0 {
+define i128 @v_ashr_i128_vk(i128 %lhs) {
; GCN-LABEL: v_ashr_i128_vk:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -120,7 +120,7 @@ define i128 @v_ashr_i128_vk(i128 %lhs) #0 {
ret i128 %shl
}
-define i128 @v_shl_i128_kv(i128 %rhs) #0 {
+define i128 @v_shl_i128_kv(i128 %rhs) {
; GCN-LABEL: v_shl_i128_kv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -142,7 +142,7 @@ define i128 @v_shl_i128_kv(i128 %rhs) #0 {
ret i128 %shl
}
-define i128 @v_lshr_i128_kv(i128 %rhs) #0 {
+define i128 @v_lshr_i128_kv(i128 %rhs) {
; GCN-LABEL: v_lshr_i128_kv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -161,7 +161,7 @@ define i128 @v_lshr_i128_kv(i128 %rhs) #0 {
ret i128 %shl
}
-define i128 @v_ashr_i128_kv(i128 %rhs) #0 {
+define i128 @v_ashr_i128_kv(i128 %rhs) {
; GCN-LABEL: v_ashr_i128_kv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -178,12 +178,14 @@ define i128 @v_ashr_i128_kv(i128 %rhs) #0 {
ret i128 %shl
}
-define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) #0 {
+define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_shl_i128_ss:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s5, s4, 64
; GCN-NEXT: s_sub_i32 s12, 64, s4
@@ -203,6 +205,7 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) #0 {
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -211,12 +214,14 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) #0 {
ret void
}
-define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) #0 {
+define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_lshr_i128_ss:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s5, s4, 64
; GCN-NEXT: s_sub_i32 s12, 64, s4
@@ -236,6 +241,7 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) #0 {
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -244,12 +250,14 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) #0 {
ret void
}
-define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) #0 {
+define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_ashr_i128_ss:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s5, 64, s4
; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
@@ -270,6 +278,7 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) #0 {
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -278,7 +287,7 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) #0 {
ret void
}
-define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
+define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: v_shl_v2i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -327,7 +336,7 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
ret <2 x i128> %shl
}
-define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
+define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: v_lshr_v2i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -376,7 +385,7 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
ret <2 x i128> %shl
}
-define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
+define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: v_ashr_v2i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -427,9 +436,12 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
ret <2 x i128> %shl
}
-define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
+define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_shl_v2i128ss:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v6, 16
; GCN-NEXT: v_mov_b32_e32 v4, 0
@@ -499,9 +511,12 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
ret void
}
-define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
+define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_lshr_v2i128_ss:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v6, 16
; GCN-NEXT: v_mov_b32_e32 v4, 0
@@ -571,9 +586,12 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) #0
ret void
}
-define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) #0 {
+define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_ashr_v2i128_ss:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v6, 16
; GCN-NEXT: v_mov_b32_e32 v4, 0
@@ -645,4 +663,3 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) #0
ret void
}
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index 0c029db96f558..46f257eff1f24 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -4,11 +4,14 @@
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) #0 {
+define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) {
; CI-LABEL: sint_to_fp_i32_to_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -20,6 +23,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -33,16 +39,19 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
; uses an SGPR (implicit vcc).
-define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) #0 {
+define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
; CI-LABEL: sint_to_fp_i1_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -52,11 +61,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) #0
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -67,11 +79,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) #0
ret void
}
-define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) #0 {
+define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) {
; CI-LABEL: sint_to_fp_i1_f64_load:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_bitcmp1_b32 s2, 0
; CI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -86,6 +101,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s2, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -100,10 +118,13 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
ret void
}
-define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) {
; CI-LABEL: s_sint_to_fp_i64_to_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3
; CI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -117,6 +138,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; VI-LABEL: s_sint_to_fp_i64_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -131,11 +155,14 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
ret void
}
-define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; CI-LABEL: v_sint_to_fp_i64_to_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
@@ -155,6 +182,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -178,11 +208,14 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
}
; FIXME: bfe and sext on VI+
-define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) {
; CI-LABEL: s_sint_to_fp_i8_to_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i8 s2, s2
; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
@@ -195,6 +228,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s2, s2, 0x80000
; VI-NEXT: s_sext_i32_i16 s2, s2
@@ -208,7 +244,7 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
ret void
}
-define double @v_sint_to_fp_i8_to_f64(i8 %in) #0 {
+define double @v_sint_to_fp_i8_to_f64(i8 %in) {
; CI-LABEL: v_sint_to_fp_i8_to_f64:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -227,16 +263,19 @@ define double @v_sint_to_fp_i8_to_f64(i8 %in) #0 {
ret double %fp
}
-define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
+define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; CI-LABEL: s_select_sint_to_fp_i1_vals_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -246,11 +285,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -261,7 +303,7 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
ret void
}
-define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
+define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: v_select_sint_to_fp_i1_vals_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -278,16 +320,19 @@ define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0
ret void
}
-define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) #0 {
+define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
; CI-LABEL: s_select_sint_to_fp_i1_vals_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -297,11 +342,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -312,7 +360,7 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
ret void
}
-define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) #0 {
+define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: v_select_sint_to_fp_i1_vals_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -330,7 +378,7 @@ define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) #0
}
; TODO: This should swap the selected order / invert the compare and do it.
-define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
+define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: v_swap_select_sint_to_fp_i1_vals_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -348,16 +396,19 @@ define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in
}
; TODO: This should swap the selected order / invert the compare and do it.
-define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
+define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; CI-LABEL: s_swap_select_sint_to_fp_i1_vals_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0, 0xbff00000
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -367,11 +418,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0, 0xbff00000
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -381,5 +435,3 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
store double %select, ptr addrspace(1) %out, align 8
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index 0689bb4fb75eb..bd255e88b9512 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -12,10 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %13.sub0
+ ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %14.sub0
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
- ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %23:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
- ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %13
+ ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %24:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %14
; GCN-NEXT: S_ENDPGM 0
%v0 = call i32 asm sideeffect "; def $0", "=v"()
%tmp = insertelement <2 x i32> poison, i32 %v0, i32 0
@@ -27,4 +27,4 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
-attributes #0 = { nounwind "amdgpu-num-vgpr"="5" "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind "amdgpu-num-vgpr"="5" }
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 0e67d7c6530c8..ef92cf3214e7f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -50,7 +50,10 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 {
define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
; HAWAII-LABEL: local_store_i55:
; HAWAII: ; %bb.0:
+; HAWAII-NEXT: s_add_i32 s12, s12, s17
; HAWAII-NEXT: s_or_b32 s0, s8, 14
+; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s9
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
@@ -70,7 +73,10 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
;
; FIJI-LABEL: local_store_i55:
; FIJI: ; %bb.0:
+; FIJI-NEXT: s_add_i32 s12, s12, s17
; FIJI-NEXT: s_or_b32 s0, s8, 14
+; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s9
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
@@ -380,4 +386,4 @@ define void @local_store_i17(ptr addrspace(3) %ptr, i17 %arg) #0 {
ret void
}
-attributes #0 = { nounwind "amdgpu-no-flat-scratch-init" }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index d9613e7cda9c9..30accc846d2b6 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @...............
+; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 9
@@ -23,7 +23,7 @@ entry:
ret void
}
-attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index 94bb08d24153a..4f84b31f1877b 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000 ................
+; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 5
@@ -23,7 +23,7 @@ entry:
ret void
}
-attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 82ba126bc0962..644f434923368 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @...............
+; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 9
@@ -23,7 +23,7 @@ entry:
ret void
}
-attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 43f028cf8649c..69cc63eba6243 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -9,7 +9,7 @@
declare void @llvm.trap() #0
declare void @llvm.debugtrap() #1
-define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) #2 {
+define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
; NOHSA-TRAP-GFX900-LABEL: trap:
; NOHSA-TRAP-GFX900: ; %bb.0:
; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -23,11 +23,14 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) #2 {
; HSA-TRAP-GFX803-LABEL: trap:
; HSA-TRAP-GFX803: ; %bb.0:
; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7]
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s2
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s3
+; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7]
; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX803-NEXT: s_trap 2
@@ -100,7 +103,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) #2 {
ret void
}
-define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr #2 {
+define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; NOHSA-TRAP-GFX900-LABEL: non_entry_trap:
; NOHSA-TRAP-GFX900: ; %bb.0: ; %entry
; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -121,6 +124,9 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
; HSA-TRAP-GFX803-LABEL: non_entry_trap:
; HSA-TRAP-GFX803: ; %bb.0: ; %entry
; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1
@@ -261,7 +267,7 @@ ret:
ret void
}
-define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) #2 {
+define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; NOHSA-TRAP-GFX900-LABEL: trap_with_use_after:
; NOHSA-TRAP-GFX900: ; %bb.0:
; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -280,6 +286,9 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
; HSA-TRAP-GFX803: ; %bb.0:
; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7]
; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5
@@ -394,7 +403,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
ret void
}
-define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) #2 {
+define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) {
; NOHSA-TRAP-GFX900-LABEL: debugtrap:
; NOHSA-TRAP-GFX900: ; %bb.0:
; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -411,10 +420,13 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
; HSA-TRAP-GFX803-LABEL: debugtrap:
; HSA-TRAP-GFX803: ; %bb.0:
; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1
; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
@@ -484,7 +496,6 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
attributes #0 = { nounwind noreturn }
attributes #1 = { nounwind }
-attributes #2 = { "amdgpu-no-flat-scratch-init" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 93dda473ffd82..698288bb3c9f4 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -5,7 +5,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GFX1030
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
-define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: udiv_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -81,6 +81,9 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-LABEL: udiv_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -182,7 +185,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
ret void
}
-define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; SI-LABEL: s_udiv_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -252,6 +255,9 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0
; GCN-LABEL: s_udiv_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3
; GCN-NEXT: s_sub_i32 s4, 0, s3
@@ -343,7 +349,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0
; The code generated by udiv is long and complex and may frequently
; change. The goal of this test is to make sure the ISel doesn't fail
; when it gets a v4i32 udiv
-define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: udiv_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -457,6 +463,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: udiv_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -616,7 +625,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
ret void
}
-define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: udiv_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -810,6 +819,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: udiv_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s2, 16
; GCN-NEXT: s_addc_u32 s5, s3, 0
@@ -1095,7 +1107,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
ret void
}
-define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: udiv_i32_div_pow2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1135,6 +1147,9 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac
; GCN-LABEL: udiv_i32_div_pow2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1180,7 +1195,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac
ret void
}
-define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: udiv_i32_div_k_even:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1224,6 +1239,9 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: udiv_i32_div_k_even:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1274,7 +1292,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
ret void
}
-define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: udiv_i32_div_k_odd:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1318,6 +1336,9 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; GCN-LABEL: udiv_i32_div_k_odd:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1368,7 +1389,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
ret void
}
-define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_udiv_i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1430,6 +1451,9 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; GCN-LABEL: v_udiv_i8:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1508,7 +1532,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
ret void
}
-define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_udiv_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1570,6 +1594,9 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: v_udiv_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1648,7 +1675,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
ret void
}
-define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_udiv_i23:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1726,6 +1753,9 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: v_udiv_i23:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s2, 4
; GCN-NEXT: s_addc_u32 s5, s3, 0
@@ -1845,7 +1875,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
ret void
}
-define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_udiv_i24:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1923,6 +1953,9 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: v_udiv_i24:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s2, 4
; GCN-NEXT: s_addc_u32 s5, s3, 0
@@ -2045,7 +2078,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
ret void
}
-define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) #0 {
+define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) {
; SI-LABEL: scalarize_mulhu_4xi32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -2105,6 +2138,9 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; GCN-LABEL: scalarize_mulhu_4xi32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
@@ -2190,7 +2226,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
ret void
}
-define amdgpu_kernel void @test_udiv2(i32 %p) #0 {
+define amdgpu_kernel void @test_udiv2(i32 %p) {
; SI-LABEL: test_udiv2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x9
@@ -2218,6 +2254,9 @@ define amdgpu_kernel void @test_udiv2(i32 %p) #0 {
; GCN-LABEL: test_udiv2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshr_b32 s0, s0, 1
; GCN-NEXT: v_mov_b32_e32 v0, s0
@@ -2250,7 +2289,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) #0 {
ret void
}
-define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) #0 {
+define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
; SI-LABEL: test_udiv_3_mulhu:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x9
@@ -2281,6 +2320,9 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0
@@ -2316,7 +2358,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) #0 {
ret void
}
-define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readonly %arg) #0 {
+define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readonly %arg) {
; SI-LABEL: fdiv_test_denormals:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
@@ -2378,6 +2420,9 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon
; GCN-LABEL: fdiv_test_denormals:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
@@ -2487,7 +2532,7 @@ bb:
ret void
}
-define i64 @v_test_udiv64_mulhi_fold(i64 %arg) #0 {
+define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
; SI-LABEL: v_test_udiv64_mulhi_fold:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2576,5 +2621,3 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) #0 {
%d = udiv i64 %arg, 100000
ret i64 %d
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 56f74f59b711a..97738a7944741 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -4,11 +4,14 @@
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_uint_to_fp_i64_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
@@ -28,6 +31,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -50,10 +56,13 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: s_uint_to_fp_i64_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -67,6 +76,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; VI-LABEL: s_uint_to_fp_i64_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -81,11 +93,14 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 x i64> %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 x i64> %in) {
; SI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1
@@ -103,6 +118,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1
@@ -123,11 +141,14 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) {
; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8
; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s1
@@ -160,6 +181,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s7
; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], s5
@@ -191,11 +215,14 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_uint_to_fp_i32_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; SI-NEXT: v_mov_b32_e32 v3, s1
@@ -207,6 +234,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -218,10 +248,13 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 x i32> %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 x i32> %in) {
; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
@@ -234,11 +267,14 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 x i32> %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 x i32> %in) {
; SI-LABEL: s_uint_to_fp_v4i32_to_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3
@@ -259,6 +295,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3
@@ -281,16 +320,19 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
; We can't fold the SGPRs into v_cndmask_b32_e32, because it already
; uses an SGPR (implicit vcc).
-define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) #0 {
+define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: uint_to_fp_i1_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -300,11 +342,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -315,11 +360,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
ret void
}
-define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %in) #0 {
+define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %in) {
; SI-LABEL: uint_to_fp_i1_to_f64_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s2, 0
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -334,6 +382,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s2, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -348,11 +399,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
ret void
}
-define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) {
; SI-LABEL: s_uint_to_fp_i8_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s2, s2, 0xff
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
@@ -365,6 +419,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xff
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
@@ -378,7 +435,7 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
}
; FIXME: Worse on VI
-define double @v_uint_to_fp_i8_to_f64(i8 %in) #0 {
+define double @v_uint_to_fp_i8_to_f64(i8 %in) {
; SI-LABEL: v_uint_to_fp_i8_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -397,16 +454,19 @@ define double @v_uint_to_fp_i8_to_f64(i8 %in) #0 {
ret double %fp
}
-define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
+define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_select_uint_to_fp_i1_vals_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -416,11 +476,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -431,7 +494,7 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
ret void
}
-define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
+define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: v_select_uint_to_fp_i1_vals_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -448,16 +511,19 @@ define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0
ret void
}
-define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) #0 {
+define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_select_uint_to_fp_i1_vals_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -467,11 +533,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -482,7 +551,7 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
ret void
}
-define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) #0 {
+define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: v_select_uint_to_fp_i1_vals_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -500,16 +569,19 @@ define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) #0
}
; TODO: This should swap the selected order / invert the compare and do it.
-define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
+define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -519,11 +591,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -534,7 +609,7 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
ret void
}
-define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) #0 {
+define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: v_swap_select_uint_to_fp_i1_vals_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -550,5 +625,3 @@ define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in
store double %select, ptr addrspace(1) %out, align 8
ret void
}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
>From baf06e5cb18f0a467143cbb96ef14d03706c702c Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Mon, 24 Mar 2025 14:38:58 -0700
Subject: [PATCH 6/6] Update tests.
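Renamed attributor-flatscratchinit-invalid.ll to
attributor-flatscratchinit-undefined-behavior.ll, and replaced
attributor-flatscratchinit-invalid2.ll with
attributor-flatscratchinit-undefined-behavior2.ll. Added RUN lines.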
---
.../attributor-flatscratchinit-invalid2.ll | 313 -------
...tor-flatscratchinit-undefined-behavior.ll} | 0
...tor-flatscratchinit-undefined-behavior2.ll | 870 ++++++++++++++++++
3 files changed, 870 insertions(+), 313 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid2.ll
rename llvm/test/CodeGen/AMDGPU/{attributor-flatscratchinit-invalid.ll => attributor-flatscratchinit-undefined-behavior.ll} (100%)
create mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid2.ll
deleted file mode 100644
index d9486c5c78223..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid2.ll
+++ /dev/null
@@ -1,313 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
-
-;
-; None of these functions should have the attribute amdgpu-no-flat-scratch-init. In these tests
-; we manually set the attribute for the functions. The purpose is to test how llc handles this.
-;
-
-;; tests of addrspacecast
-
-define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
-; GFX9-LABEL: with_private_to_flat_addrspacecast:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9-NEXT: flat_store_dword v[0:1], v2
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: with_private_to_flat_addrspacecast:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
-; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
-; GFX10-NEXT: flat_store_dword v[0:1], v2
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %stof = addrspacecast ptr addrspace(5) %ptr to ptr
- store volatile i32 0, ptr %stof
- ret void
-}
-
-define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 {
-; GFX9-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s2, -1
-; GFX9-NEXT: s_cselect_b32 s0, s1, 0
-; GFX9-NEXT: s_cselect_b32 s1, s2, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: flat_store_dword v[0:1], v2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0
-; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lg_u32 s2, -1
-; GFX10-NEXT: s_cselect_b32 s0, s2, 0
-; GFX10-NEXT: s_cselect_b32 s1, s1, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: flat_store_dword v[0:1], v2
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_endpgm
- %stof = addrspacecast ptr addrspace(5) %ptr to ptr
- store volatile i32 0, ptr %stof
- ret void
-}
-
-define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
-; GFX9-LABEL: call_with_private_to_flat_addrspacecast:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[16:17]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX9-NEXT: v_writelane_b32 v3, s30, 0
-; GFX9-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: v_readlane_b32 s31, v3, 1
-; GFX9-NEXT: v_readlane_b32 s30, v3, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_mov_b32 s33, s18
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: call_with_private_to_flat_addrspacecast:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s18, s33
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
-; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s16
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_getpc_b64 s[16:17]
-; GFX10-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v3, s30, 0
-; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: v_writelane_b32 v3, s31, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX10-NEXT: v_readlane_b32 s31, v3, 1
-; GFX10-NEXT: v_readlane_b32 s30, v3, 0
-; GFX10-NEXT: s_mov_b32 s32, s33
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_mov_b32 s33, s18
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
- ret void
-}
-
-define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 {
-; GFX9-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s0, s15
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 s8, s8, 8
-; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s15
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s0, s0, s15
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0
-; GFX10-NEXT: s_add_u32 s8, s8, 8
-; GFX10-NEXT: s_addc_u32 s9, s9, 0
-; GFX10-NEXT: s_getpc_b64 s[16:17]
-; GFX10-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX10-NEXT: s_mov_b32 s32, 0
-; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s15
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX10-NEXT: s_endpgm
- call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
- ret void
-}
-
-;; tests of addrspacecast in a constant
-
-define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) #0 {
-; GFX9-LABEL: private_constant_expression_use:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_mov_b64 s[2:3], src_private_base
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: private_constant_expression_use:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_endpgm
- store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8
- ret void
-}
-
-;; tests of intrinsics
-
-define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 {
-; GFX9-LABEL: calls_intrin_ascast_cc_kernel:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
-; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, 7
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: flat_store_dword v[0:1], v2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: calls_intrin_ascast_cc_kernel:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0
-; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base
-; GFX10-NEXT: v_mov_b32_e32 v2, 7
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: flat_store_dword v[0:1], v2
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_endpgm
- %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
- store volatile i32 7, ptr %1, align 4
- ret void
-}
-
-define void @calls_intrin_ascast(ptr addrspace(3) %ptr) #0 {
-; GFX9-LABEL: calls_intrin_ascast:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, 7
-; GFX9-NEXT: flat_store_dword v[0:1], v2
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: calls_intrin_ascast:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX10-NEXT: v_mov_b32_e32 v2, 7
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: flat_store_dword v[0:1], v2
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
- store volatile i32 7, ptr %1, align 4
- ret void
-}
-
-define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 {
-; GFX9-LABEL: call_calls_intrin_ascast_cc_kernel:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s0, s15
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 s8, s8, 8
-; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_getpc_b64 s[16:17]
-; GFX9-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s15
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: call_calls_intrin_ascast_cc_kernel:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s0, s0, s15
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0
-; GFX10-NEXT: s_add_u32 s8, s8, 8
-; GFX10-NEXT: s_addc_u32 s9, s9, 0
-; GFX10-NEXT: s_getpc_b64 s[16:17]
-; GFX10-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; GFX10-NEXT: s_mov_b32 s32, 0
-; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s15
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX10-NEXT: s_endpgm
- call void @calls_intrin_ascast(ptr addrspace(3) %ptr)
- ret void
-}
-
-attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
similarity index 100%
rename from llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-invalid.ll
rename to llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
new file mode 100644
index 0000000000000..51caa84450ff3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll
@@ -0,0 +1,870 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GFX8-ARCH-FLAT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GFX9-ARCH-FLAT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GFX942-ARCH-FLAT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+
+;
+; None of these functions should have the attribute amdgpu-no-flat-scratch-init. In these tests
+; we manually set the attribute for the functions. The purpose is to test how llc handles this.
+;
+
+;; tests of addrspacecast
+
+define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
+; GFX8-LABEL: with_private_to_flat_addrspacecast:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
+; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast:
+; GFX8-ARCH-FLAT: ; %bb.0:
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], 0xc0
+; GFX8-ARCH-FLAT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX8-ARCH-FLAT-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: with_private_to_flat_addrspacecast:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast:
+; GFX9-ARCH-FLAT: ; %bb.0:
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-ARCH-FLAT-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast:
+; GFX942-ARCH-FLAT: ; %bb.0:
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-ARCH-FLAT-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-ARCH-FLAT-NEXT: s_nop 0
+; GFX942-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: with_private_to_flat_addrspacecast:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
+; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
+; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+ store volatile i32 0, ptr %stof
+ ret void
+}
+
+define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 {
+; GFX8-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX8-NEXT: s_load_dword s1, s[8:9], 0xc8
+; GFX8-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_cmp_lg_u32 s0, -1
+; GFX8-NEXT: s_cselect_b32 s1, s1, 0
+; GFX8-NEXT: s_cselect_b32 s0, s0, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX8-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
+; GFX8-ARCH-FLAT: ; %bb.0:
+; GFX8-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-ARCH-FLAT-NEXT: s_load_dword s1, s[4:5], 0xc8
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: s_cmp_lg_u32 s0, -1
+; GFX8-ARCH-FLAT-NEXT: s_cselect_b32 s1, s1, 0
+; GFX8-ARCH-FLAT-NEXT: s_cselect_b32 s0, s0, 0
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX9-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s2, -1
+; GFX9-NEXT: s_cselect_b32 s0, s1, 0
+; GFX9-NEXT: s_cselect_b32 s1, s2, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX9-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
+; GFX9-ARCH-FLAT: ; %bb.0:
+; GFX9-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: s_cmp_lg_u32 s2, -1
+; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s0, s1, 0
+; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s1, s2, 0
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX942-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
+; GFX942-ARCH-FLAT: ; %bb.0:
+; GFX942-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: s_cmp_lg_u32 s2, -1
+; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s0, s1, 0
+; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s1, s2, 0
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX10-LABEL: with_private_to_flat_addrspacecast_cc_kernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_cmp_lg_u32 s2, -1
+; GFX10-NEXT: s_cselect_b32 s0, s2, 0
+; GFX10-NEXT: s_cselect_b32 s1, s1, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+ %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+ store volatile i32 0, ptr %stof
+ ret void
+}
+
+define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
+; GFX8-LABEL: call_with_private_to_flat_addrspacecast:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s18, s33
+; GFX8-NEXT: s_mov_b32 s33, s32
+; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[16:17]
+; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: s_getpc_b64 s[16:17]
+; GFX8-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX8-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX8-NEXT: v_writelane_b32 v3, s30, 0
+; GFX8-NEXT: v_writelane_b32 v3, s31, 1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT: v_readlane_b32 s31, v3, 1
+; GFX8-NEXT: v_readlane_b32 s30, v3, 0
+; GFX8-NEXT: s_mov_b32 s32, s33
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b32 s33, s18
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast:
+; GFX8-ARCH-FLAT: ; %bb.0:
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s2, s33
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s33, s32
+; GFX8-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX8-ARCH-FLAT-NEXT: s_add_i32 s3, s33, 8
+; GFX8-ARCH-FLAT-NEXT: scratch_store_dword off, v3, s3 ; 4-byte Folded Spill
+; GFX8-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1]
+; GFX8-ARCH-FLAT-NEXT: s_add_i32 s32, s32, 16
+; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[0:1]
+; GFX8-ARCH-FLAT-NEXT: s_add_u32 s0, s0, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX8-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0
+; GFX8-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
+; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s32, s33
+; GFX8-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX8-ARCH-FLAT-NEXT: s_add_i32 s3, s33, 8
+; GFX8-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s3 ; 4-byte Folded Reload
+; GFX8-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1]
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s33, s2
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: call_with_private_to_flat_addrspacecast:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s18, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[16:17]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[16:17]
+; GFX9-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: v_writelane_b32 v3, s30, 0
+; GFX9-NEXT: v_writelane_b32 v3, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: v_readlane_b32 s31, v3, 1
+; GFX9-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b32 s33, s18
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast:
+; GFX9-ARCH-FLAT: ; %bb.0:
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s2, s33
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s33, s32
+; GFX9-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX9-ARCH-FLAT-NEXT: scratch_store_dword off, v3, s33 ; 4-byte Folded Spill
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-ARCH-FLAT-NEXT: s_add_i32 s32, s32, 16
+; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[0:1]
+; GFX9-ARCH-FLAT-NEXT: s_add_u32 s0, s0, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0
+; GFX9-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
+; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s32, s33
+; GFX9-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX9-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s33 ; 4-byte Folded Reload
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s33, s2
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast:
+; GFX942-ARCH-FLAT: ; %bb.0:
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s2, s33
+; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s33, s32
+; GFX942-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX942-ARCH-FLAT-NEXT: scratch_store_dword off, v3, s33 ; 4-byte Folded Spill
+; GFX942-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1]
+; GFX942-ARCH-FLAT-NEXT: s_add_i32 s32, s32, 16
+; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[0:1]
+; GFX942-ARCH-FLAT-NEXT: s_add_u32 s0, s0, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0
+; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1
+; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0
+; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s32, s33
+; GFX942-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX942-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s33 ; 4-byte Folded Reload
+; GFX942-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1]
+; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s33, s2
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: call_with_private_to_flat_addrspacecast:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s18, s33
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_xor_saveexec_b32 s16, -1
+; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s16
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_getpc_b64 s[16:17]
+; GFX10-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v3, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX10-NEXT: v_writelane_b32 v3, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: v_readlane_b32 s31, v3, 1
+; GFX10-NEXT: v_readlane_b32 s30, v3, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_mov_b32 s33, s18
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+ ret void
+}
+
+define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 {
+; GFX8-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s0, s15
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_load_dword s15, s[8:9], 0x0
+; GFX8-NEXT: s_add_u32 s8, s8, 8
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_getpc_b64 s[16:17]
+; GFX8-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX8-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s15
+; GFX8-NEXT: s_mov_b32 s32, 0
+; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT: s_endpgm
+;
+; GFX8-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel:
+; GFX8-ARCH-FLAT: ; %bb.0:
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s12, s8
+; GFX8-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s13, s9
+; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0
+; GFX8-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
+; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5]
+; GFX8-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast@gotpcrel32@lo+4
+; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast@gotpcrel32@hi+12
+; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
+; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s32, 0
+; GFX8-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX8-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX9-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 s0, s0, s15
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 s8, s8, 8
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: s_getpc_b64 s[16:17]
+; GFX9-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast at gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast at gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s15
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_endpgm
+;
+; GFX9-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel:
+; GFX9-ARCH-FLAT: ; %bb.0:
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s12, s8
+; GFX9-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s13, s9
+; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0
+; GFX9-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
+; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5]
+; GFX9-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast at gotpcrel32@lo+4
+; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast at gotpcrel32@hi+12
+; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-ARCH-FLAT-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s32, 0
+; GFX9-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX942-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel:
+; GFX942-ARCH-FLAT: ; %bb.0:
+; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s12, s8
+; GFX942-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8
+; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s13, s9
+; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0
+; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
+; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5]
+; GFX942-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast at gotpcrel32@lo+4
+; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast at gotpcrel32@hi+12
+; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
+; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v31, v0
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15
+; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s32, 0
+; GFX942-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX942-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX10-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s0, s0, s15
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0
+; GFX10-NEXT: s_add_u32 s8, s8, 8
+; GFX10-NEXT: s_addc_u32 s9, s9, 0
+; GFX10-NEXT: s_getpc_b64 s[16:17]
+; GFX10-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast at gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast at gotpcrel32@hi+12
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT: s_mov_b32 s32, 0
+; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s15
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: s_endpgm
+ call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+ ret void
+}
+
+;; tests of addrspacecast in a constant
+
+; Kernel that volatile-stores a constant-expression flat pointer to %out. The
+; stored value is the addrspace(5) pointer built from inttoptr (i32 123),
+; addrspacecast to a flat pointer. In the check lines the low dword of the
+; stored pair is the literal 0x7b; the high dword comes from an s_load_dword
+; at offset 0xc8 for the two GFX8 prefixes and from src_private_base for the
+; GFX9, GFX942 and GFX10 prefixes.
+define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) #0 {
+; GFX8-LABEL: private_constant_expression_use:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s2, s[8:9], 0xc8
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX8-ARCH-FLAT-LABEL: private_constant_expression_use:
+; GFX8-ARCH-FLAT: ; %bb.0:
+; GFX8-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0xc8
+; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-ARCH-FLAT-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX9-LABEL: private_constant_expression_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX9-ARCH-FLAT-LABEL: private_constant_expression_use:
+; GFX9-ARCH-FLAT: ; %bb.0:
+; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX942-ARCH-FLAT-LABEL: private_constant_expression_use:
+; GFX942-ARCH-FLAT: ; %bb.0:
+; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX10-LABEL: private_constant_expression_use:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+ store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8
+ ret void
+}
+
+;; tests of intrinsics
+
+; Kernel that converts its addrspace(3) pointer argument to a flat pointer with
+; llvm.amdgcn.addrspacecast.nonnull and volatile-stores i32 7 through the
+; result. In the check lines the high half of the flat address comes from an
+; s_load_dword at offset 0xcc for the two GFX8 prefixes and from
+; src_shared_base for the GFX9, GFX942 and GFX10 prefixes.
+define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 {
+; GFX8-LABEL: calls_intrin_ascast_cc_kernel:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX8-NEXT: s_load_dword s1, s[8:9], 0xcc
+; GFX8-NEXT: v_mov_b32_e32 v2, 7
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX8-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel:
+; GFX8-ARCH-FLAT: ; %bb.0:
+; GFX8-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-ARCH-FLAT-NEXT: s_load_dword s1, s[4:5], 0xcc
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX9-LABEL: calls_intrin_ascast_cc_kernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, 7
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX9-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel:
+; GFX9-ARCH-FLAT: ; %bb.0:
+; GFX9-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX942-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel:
+; GFX942-ARCH-FLAT: ; %bb.0:
+; GFX942-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s2
+; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX10-LABEL: calls_intrin_ascast_cc_kernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX10-NEXT: v_mov_b32_e32 v2, 7
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+ %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
+ store volatile i32 7, ptr %1, align 4
+ ret void
+}
+
+; Non-kernel function that converts its addrspace(3) pointer argument to a flat
+; pointer with llvm.amdgcn.addrspacecast.nonnull and volatile-stores i32 7
+; through the result. The check lines return with s_setpc_b64 s[30:31]. For
+; the two GFX8 prefixes the high half of the flat address is read with an
+; s_load_dword from the constant address 0xc4; the GFX9, GFX942 and GFX10
+; prefixes take it from src_shared_base.
+define void @calls_intrin_ascast(ptr addrspace(3) %ptr) #0 {
+; GFX8-LABEL: calls_intrin_ascast:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b64 s[4:5], 0xc4
+; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v2, 7
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-ARCH-FLAT-LABEL: calls_intrin_ascast:
+; GFX8-ARCH-FLAT: ; %bb.0:
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], 0xc4
+; GFX8-ARCH-FLAT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: calls_intrin_ascast:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, 7
+; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-ARCH-FLAT-LABEL: calls_intrin_ascast:
+; GFX9-ARCH-FLAT: ; %bb.0:
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7
+; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-ARCH-FLAT-LABEL: calls_intrin_ascast:
+; GFX942-ARCH-FLAT: ; %bb.0:
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7
+; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: calls_intrin_ascast:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX10-NEXT: v_mov_b32_e32 v2, 7
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
+ store volatile i32 7, ptr %1, align 4
+ ret void
+}
+
+; Kernel that forwards its addrspace(3) pointer argument to
+; @calls_intrin_ascast. For every check prefix the expected code materialises
+; the callee address through a gotpcrel32 relocation (s_getpc_b64,
+; s_add_u32/s_addc_u32, s_load_dwordx2) and performs the call with
+; s_swappc_b64. The relocation operands are spelled symbol@gotpcrel32@lo and
+; symbol@gotpcrel32@hi, which is how they must appear for the checks to match.
+define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 {
+; GFX8-LABEL: call_calls_intrin_ascast_cc_kernel:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s0, s15
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_load_dword s15, s[8:9], 0x0
+; GFX8-NEXT: s_add_u32 s8, s8, 8
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_getpc_b64 s[16:17]
+; GFX8-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX8-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12
+; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s15
+; GFX8-NEXT: s_mov_b32 s32, 0
+; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX8-NEXT: s_endpgm
+;
+; GFX8-ARCH-FLAT-LABEL: call_calls_intrin_ascast_cc_kernel:
+; GFX8-ARCH-FLAT: ; %bb.0:
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s12, s8
+; GFX8-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s13, s9
+; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0
+; GFX8-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
+; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5]
+; GFX8-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12
+; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
+; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15
+; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s32, 0
+; GFX8-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX8-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX9-LABEL: call_calls_intrin_ascast_cc_kernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 s0, s0, s15
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 s8, s8, 8
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: s_getpc_b64 s[16:17]
+; GFX9-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s15
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: s_endpgm
+;
+; GFX9-ARCH-FLAT-LABEL: call_calls_intrin_ascast_cc_kernel:
+; GFX9-ARCH-FLAT: ; %bb.0:
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s12, s8
+; GFX9-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s13, s9
+; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0
+; GFX9-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
+; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5]
+; GFX9-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12
+; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-ARCH-FLAT-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15
+; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s32, 0
+; GFX9-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX942-ARCH-FLAT-LABEL: call_calls_intrin_ascast_cc_kernel:
+; GFX942-ARCH-FLAT: ; %bb.0:
+; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s12, s8
+; GFX942-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8
+; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s13, s9
+; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0
+; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0
+; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5]
+; GFX942-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12
+; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s14, s10
+; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v31, v0
+; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15
+; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s32, 0
+; GFX942-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX942-ARCH-FLAT-NEXT: s_endpgm
+;
+; GFX10-LABEL: call_calls_intrin_ascast_cc_kernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s0, s0, s15
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0
+; GFX10-NEXT: s_add_u32 s8, s8, 8
+; GFX10-NEXT: s_addc_u32 s9, s9, 0
+; GFX10-NEXT: s_getpc_b64 s[16:17]
+; GFX10-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT: s_mov_b32 s32, 0
+; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s15
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX10-NEXT: s_endpgm
+ call void @calls_intrin_ascast(ptr addrspace(3) %ptr)
+ ret void
+}
+
+; Attribute group referenced through the trailing #0 on the define lines of the
+; functions in this test.
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
More information about the llvm-commits mailing list